In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import entropy
In [2]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset.
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR / relative path.
CLEANED_DATA_PATH = "C:/Users/marco/Downloads/covid-data-cleaned.csv"
df = pd.read_csv(CLEANED_DATA_PATH)
df
Out[2]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [3]:
# Pairings of countries based on total population (13 pairs of countries).
# The 26 near-identical filter expressions are routed through one helper; the
# resulting variable names are unchanged so later cells keep working.
def country_rows(frame, name):
    """Return the rows of `frame` belonging to one country (matched on 'location')."""
    return frame[frame["location"] == name]

df_Austria = country_rows(df, "Austria")
df_Switzerland = country_rows(df, "Switzerland")

df_Belgium = country_rows(df, "Belgium")
df_Canada = country_rows(df, "Canada")

df_Bulgaria = country_rows(df, "Bulgaria")
df_Serbia = country_rows(df, "Serbia")

df_Cyprus = country_rows(df, "Cyprus")
df_Luxembourg = country_rows(df, "Luxembourg")

df_Czechia = country_rows(df, "Czechia")
df_Romania = country_rows(df, "Romania")

df_Denmark = country_rows(df, "Denmark")
df_Ireland = country_rows(df, "Ireland")

df_Estonia = country_rows(df, "Estonia")
df_Latvia = country_rows(df, "Latvia")

df_Finland = country_rows(df, "Finland")
df_Iceland = country_rows(df, "Iceland")

df_France = country_rows(df, "France")
df_Italy = country_rows(df, "Italy")

df_Netherlands = country_rows(df, "Netherlands")
df_Sweden = country_rows(df, "Sweden")

df_Portugal = country_rows(df, "Portugal")
df_Spain = country_rows(df, "Spain")

df_Slovakia = country_rows(df, "Slovakia")
df_Slovenia = country_rows(df, "Slovenia")

df_UnitedKingdom = country_rows(df, "United Kingdom")
df_UnitedStates = country_rows(df, "United States")
In [4]:
# Drop the first two rows of the UK frame (equivalent to .tail(-2)) — presumably
# to align its date range with the other countries; TODO confirm intent.
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
In [5]:
# Concatenate the individual country dataframes (one per country from the pairs above)
# into a single dataframe; ignore_index gives a clean 0..N-1 index.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes, ignore_index=True)

# Export to CSV; index=False prevents a spurious 'Unnamed: 0' column when re-importing.
dataframe_one.to_csv("dataframe-one.csv", index=False)
In [6]:
# Importing the dataframe of all 26 countries.
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
COMBINED_DATA_PATH = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(COMBINED_DATA_PATH)
df_updated
Out[6]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [7]:
country1 = 'Austria'
country2 = 'Switzerland'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes this an independent frame so the lagged-mortality columns assigned
# in a later cell do not trigger pandas' SettingWithCopyWarning on a slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [8]:
# Visual sanity check of the filtered two-country frame (rich display)
df_updated
Out[8]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322149

2078 rows × 10 columns

In [9]:
# Convert the time series into a supervised-learning layout by adding lagged copies
# of the target: the previous day's, week's, and month's mortality rate. shift() is
# applied within each 'location' group so a lag never crosses a country boundary.
# NOTE(review): these lag columns only reach the model indirectly via the PCA input
# matrix (df_updated.iloc[:, 2:]); they are never used as named features.
grouped_mortality = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=grouped_mortality.shift(1),
    prev_week_mortality=grouped_mortality.shift(7),
    prev_month_mortality=grouped_mortality.shift(30),
)
In [10]:
# Lagged values are undefined at the start of each country's series; treat them as 0.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col in lag_columns:
    df_updated[col] = df_updated[col].fillna(0)
In [11]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and its three lagged copies, so the fitted components encode the target itself —
# target leakage that likely explains the near-perfect R^2 reported downstream.
# PCA is also fit on ALL rows before the train/test split (train→test leakage).
# Fit PCA on training rows only, excluding the target and its lags.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[11]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [12]:
# Keep the first 7 principal components (one per population-health input variable).
# NOTE(review): PCA was fit on 11 columns (7 features + 'Mortality Rate' + 3 lag
# columns), so these 7 components are mixtures of all 11 columns — target included —
# and are NOT in one-to-one correspondence with the 7 input variables.
n_components = 7  # number of population-health input variables
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [13]:
# Dataframe of the first 7 principal components.
# NOTE(review): each component is a linear mixture of ALL columns used in the PCA
# fit (including 'Mortality Rate' and the lag columns). Naming the components after
# the original features is misleading and makes the later feature-importance table
# uninterpretable as raw-variable importance; prefer names like 'PC1'..'PC7'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [14]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never consumed afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this is effectively dead code apart
# from removing the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [15]:
# X: the first 7 PCA components (see the naming caveat where principal_df is built);
# y: raw mortality rate, row-aligned with principal_df by position.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for daily time-series data a
# random split places future days in training and past days in test, leaking
# temporal information (amplified by the lagged-target PCA inputs). A
# chronological split would be the defensible choice here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [16]:
# Fit the standardizer on the training split only; fit() returns the scaler,
# and leaving it as the last expression preserves the rich display.
scaler = StandardScaler()
scaler.fit(X_train)
Out[16]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [17]:
# Apply the training-fitted scaling to the training features
X_train_scaled = scaler.transform(X_train)
In [18]:
# Apply the same training-fitted scaling to the test features (no refit, so no
# scaling leakage at this step)
X_test_scaled = scaler.transform(X_test)
In [19]:
# Define the XGBoost regressor.
# random_state pins the row/column subsampling below (subsample / colsample_bytree
# < 1 make training stochastic), so grid-search results are reproducible.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [20]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 parallelizes across cores.
# Default scoring for a regressor is R^2, which is what best_score_ reports.
# NOTE(review): plain KFold on shuffled time-series rows leaks future information
# across folds — consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991699923252158
In [21]:
# GridSearchCV uses refit=True by default, so best_estimator_ is already fitted on
# the full scaled training set — the explicit .fit() call here was redundant work
# (and, without a fixed seed, could silently produce a different model).
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [22]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between its
# two inputs after normalizing each to sum to 1 — it treats y_test / y_pred as
# probability distributions, not regression targets, and zeros in y_test make it
# fragile. Reconsider whether this is a meaningful regression metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0013425167575155834
R2 Score: 0.9992626021702602
RMSE: 0.036640
Entropy Value: 0.00019452803666383865
In [23]:
# NOTE(review): X was built from PCA components that were merely *labeled* with the
# original column names; these importances therefore describe principal components,
# not the raw variables — attributing e.g. 0.93 to 'diabetes_prevalence' is not
# valid without mapping back through the PCA loadings.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[23]:
feature importance
1 diabetes_prevalence 0.928385
6 median_age 0.045537
2 female_smokers 0.012387
0 cardiovasc_death_rate 0.009839
5 aged_65_older 0.002963
3 male_smokers 0.000738
4 life_expectancy 0.000151
In [24]:
# Importing the dataframe of all 26 countries.
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
COMBINED_DATA_PATH = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(COMBINED_DATA_PATH)
df_updated
Out[24]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [25]:
country1 = 'Austria'
country2 = 'Switzerland'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
# .copy() makes this an independent frame so the lagged-mortality columns assigned
# in a later cell do not trigger pandas' SettingWithCopyWarning on a slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [26]:
# Visual sanity check of the filtered two-country frame (rich display)
df_updated
Out[26]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
1 Austria 2/26/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
2 Austria 2/27/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
3 Austria 2/28/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
4 Austria 2/29/2020 7.37 0.922 0.70 45436.686 106.749 0.000000
... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.53 0.955 0.03 57410.166 214.243 0.322922
14645 Switzerland 12/26/2022 4.53 0.955 0.03 57410.166 214.243 0.322922
14646 Switzerland 12/27/2022 4.53 0.955 0.03 57410.166 214.243 0.322922
14647 Switzerland 12/28/2022 4.53 0.955 0.03 57410.166 214.243 0.323082
14648 Switzerland 12/29/2022 4.53 0.955 0.03 57410.166 214.243 0.322149

2078 rows × 8 columns

In [27]:
# Convert the time series into a supervised-learning layout by adding lagged copies
# of the target: the previous day's, week's, and month's mortality rate. shift() is
# applied within each 'location' group so a lag never crosses a country boundary.
# NOTE(review): these lag columns only reach the model indirectly via the PCA input
# matrix (df_updated.iloc[:, 2:]); they are never used as named features.
grouped_mortality = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=grouped_mortality.shift(1),
    prev_week_mortality=grouped_mortality.shift(7),
    prev_month_mortality=grouped_mortality.shift(30),
)
In [28]:
# Lagged values are undefined at the start of each country's series; treat them as 0.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col in lag_columns:
    df_updated[col] = df_updated[col].fillna(0)
In [29]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# three lagged copies, so the components encode the target itself — target leakage
# that likely explains the near-perfect R^2 downstream. PCA is also fit on all rows
# before the train/test split. Fit on training features only, without the target/lags.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[29]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [30]:
# Keep the first 5 principal components, matching the 5 country-health input
# variables (the previous comment said 3, but the code uses 5).
# NOTE(review): PCA was fit on 9 columns (5 features + 'Mortality Rate' + 3 lag
# columns), so these components are mixtures of all 9 — target included — and are
# NOT in one-to-one correspondence with the 5 input variables.
n_components = 5  # number of country-health input variables
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [31]:
# Dataframe of the first 5 principal components.
# NOTE(review): each component is a linear mixture of ALL columns used in the PCA
# fit (including 'Mortality Rate' and the lag columns). Naming the components after
# the original features is misleading and makes the later feature-importance table
# uninterpretable as raw-variable importance; prefer names like 'PC1'..'PC5'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [32]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never consumed afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this is effectively dead code apart
# from removing the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [33]:
# X: the first 5 PCA components (see the naming caveat where principal_df is built);
# y: raw mortality rate, row-aligned with principal_df by position.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; a random split of daily
# time-series data leaks temporal information (amplified by the lagged-target PCA
# inputs). A chronological split would be the defensible choice here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [34]:
# Fit the standardizer on the training split only; fit() returns the scaler,
# and leaving it as the last expression preserves the rich display.
scaler = StandardScaler()
scaler.fit(X_train)
Out[34]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [35]:
# Apply the training-fitted scaling to the training features
X_train_scaled = scaler.transform(X_train)
In [36]:
# Apply the same training-fitted scaling to the test features (no refit, so no
# scaling leakage at this step)
X_test_scaled = scaler.transform(X_test)
In [37]:
# Define the XGBoost regressor.
# random_state pins the row/column subsampling below (subsample / colsample_bytree
# < 1 make training stochastic), so grid-search results are reproducible.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [38]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 parallelizes across cores.
# Default scoring for a regressor is R^2, which is what best_score_ reports.
# NOTE(review): plain KFold on shuffled time-series rows leaks future information
# across folds — consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9974454800061979
In [39]:
# GridSearchCV uses refit=True by default, so best_estimator_ is already fitted on
# the full scaled training set — the explicit .fit() call here was redundant work
# (and, without a fixed seed, could silently produce a different model).
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [40]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between its
# two inputs after normalizing each to sum to 1 — it treats y_test / y_pred as
# probability distributions, not regression targets, and zeros in y_test make it
# fragile. Reconsider whether this is a meaningful regression metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0020885572459370403
R2 Score: 0.9988528280397101
RMSE: 0.045701
Entropy Value: 0.0004103351755030901
In [41]:
# NOTE(review): X was built from PCA components that were merely *labeled* with the
# original column names; these importances therefore describe principal components,
# not the raw variables — attributing e.g. 0.86 to 'human_development_index' is not
# valid without mapping back through the PCA loadings.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[41]:
feature importance
1 human_development_index 0.860290
0 hospital_beds_per_thousand 0.062671
2 extreme_poverty 0.042865
3 gdp_per_capita 0.032818
4 population_density 0.001356
In [42]:
# Importing the dataframe of all 26 countries.
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
COMBINED_DATA_PATH = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(COMBINED_DATA_PATH)
df_updated
Out[42]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [43]:
country1 = 'Belgium'
country2 = 'Canada'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes this an independent frame so the lagged-mortality columns assigned
# in a later cell do not trigger pandas' SettingWithCopyWarning on a slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [44]:
# Visual sanity check of the filtered two-country frame (rich display)
df_updated
Out[44]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2132 rows × 10 columns

In [45]:
# Convert the time series into a supervised-learning layout by adding lagged copies
# of the target: the previous day's, week's, and month's mortality rate. shift() is
# applied within each 'location' group so a lag never crosses a country boundary.
# NOTE(review): these lag columns only reach the model indirectly via the PCA input
# matrix (df_updated.iloc[:, 2:]); they are never used as named features.
grouped_mortality = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=grouped_mortality.shift(1),
    prev_week_mortality=grouped_mortality.shift(7),
    prev_month_mortality=grouped_mortality.shift(30),
)
In [46]:
# Lagged values are undefined at the start of each country's series; treat them as 0.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col in lag_columns:
    df_updated[col] = df_updated[col].fillna(0)
In [47]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# three lagged copies, so the components encode the target itself — target leakage
# that likely explains the near-perfect R^2 downstream. PCA is also fit on all rows
# before the train/test split. Fit on training features only, without the target/lags.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[47]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [48]:
# Setting the number of principal components to 3 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [49]:
# Dataframe of the first 7 principal components.
# NOTE(review): each component is a linear mixture of ALL columns used in the PCA
# fit (including 'Mortality Rate' and the lag columns). Naming the components after
# the original features is misleading and makes the later feature-importance table
# uninterpretable as raw-variable importance; prefer names like 'PC1'..'PC7'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [50]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never consumed afterwards — X is built from
# principal_df and y from 'Mortality Rate' — so this is effectively dead code apart
# from removing the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [51]:
# X: the first 7 PCA components (see the naming caveat where principal_df is built);
# y: raw mortality rate, row-aligned with principal_df by position.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; a random split of daily
# time-series data leaks temporal information (amplified by the lagged-target PCA
# inputs). A chronological split would be the defensible choice here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [52]:
# Fit the standardizer on the training split only; fit() returns the scaler,
# and leaving it as the last expression preserves the rich display.
scaler = StandardScaler()
scaler.fit(X_train)
Out[52]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [53]:
# Apply the training-fitted scaling to the training features
X_train_scaled = scaler.transform(X_train)
In [54]:
# Apply the same training-fitted scaling to the test features (no refit, so no
# scaling leakage at this step)
X_test_scaled = scaler.transform(X_test)
In [55]:
# Define the XGBoost regressor.
# random_state pins the row/column subsampling below (subsample / colsample_bytree
# < 1 make training stochastic), so grid-search results are reproducible.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [56]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 parallelizes across cores.
# Default scoring for a regressor is R^2, which is what best_score_ reports.
# NOTE(review): plain KFold on shuffled time-series rows leaks future information
# across folds — consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989190282407456
In [57]:
# GridSearchCV uses refit=True by default, so best_estimator_ is already fitted on
# the full scaled training set — the explicit .fit() call here was redundant work
# (and, without a fixed seed, could silently produce a different model).
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [58]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between its
# two inputs after normalizing each to sum to 1 — it treats y_test / y_pred as
# probability distributions, not regression targets, and zeros in y_test make it
# fragile. Reconsider whether this is a meaningful regression metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.015185943246172551
R2 Score: 0.9988516813120202
RMSE: 0.123231
Entropy Value: 0.0003900318909582723
In [59]:
# NOTE(review): X was built from PCA components that were merely *labeled* with the
# original column names; these importances therefore describe principal components,
# not the raw variables — attributing e.g. 0.85 to 'diabetes_prevalence' is not
# valid without mapping back through the PCA loadings.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[59]:
feature importance
1 diabetes_prevalence 0.852780
0 cardiovasc_death_rate 0.064945
5 aged_65_older 0.033227
6 median_age 0.028916
2 female_smokers 0.016257
3 male_smokers 0.003580
4 life_expectancy 0.000296
In [60]:
# Importing the dataframe of all 26 countries.
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
COMBINED_DATA_PATH = "C:/Users/marco/Downloads/dataframe-one.csv"
df_updated = pd.read_csv(COMBINED_DATA_PATH)
df_updated
Out[60]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [61]:
country1 = 'Belgium'
country2 = 'Canada'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() materialises an independent frame so the lag-column assignments in the
# next cells do not hit pandas' SettingWithCopyWarning / ambiguous chained-
# assignment behaviour on a filtered slice of the original frame.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [62]:
df_updated
Out[62]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.50 0.929 0.5 44017.591 4.037 1.092509
15717 Canada 12/26/2022 2.50 0.929 0.5 44017.591 4.037 1.092338
15718 Canada 12/27/2022 2.50 0.929 0.5 44017.591 4.037 1.092196
15719 Canada 12/28/2022 2.50 0.929 0.5 44017.591 4.037 1.092321
15720 Canada 12/29/2022 2.50 0.929 0.5 44017.591 4.037 1.093162

2132 rows × 8 columns

In [63]:
# Convert the time series into a supervised-learning layout: pandas' shift()
# adds lagged copies of the target (previous day / week / month mortality) so
# each row becomes a self-contained tabular observation that XGBoost can
# consume directly, which is what lets the model rank predictors of COVID-19
# mortality per country.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [64]:
# The earliest rows of each country have no history, so the shifted lag columns
# begin as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [65]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# at this point includes 'Mortality Rate' (the prediction target) and its
# lagged copies. The fitted components therefore encode the target itself —
# target leakage that likely explains the near-perfect downstream R^2 (~0.999).
# Consider excluding the target before fitting, e.g.
# pca.fit(df_updated.drop(columns=['Mortality Rate']).iloc[:, 2:]).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[65]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [66]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# (the original comment said 3, which did not match the code). Only the first
# n_components highest-variance components are kept.
n_components = 5  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [67]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component — a linear combination
# of ALL the PCA input columns — not the original feature it is named after.
# The names are reused purely as labels, which makes downstream tables easy to
# misread as per-feature results.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [68]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model below (X is built
# from principal_df), so this step only reshapes df_updated for bookkeeping.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [69]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so temporally adjacent
# (highly autocorrelated) observations end up in both train and test — this is
# optimistic for a time series. A chronological split (shuffle=False) or
# TimeSeriesSplit would give a more honest generalisation estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [70]:
# Fit scaling on the training set
# Fitting on the training split only (then reusing the same scaler for the test
# split) correctly avoids leaking test statistics into the scaler. Note that
# tree-based XGBoost is insensitive to monotonic feature scaling, so this step
# is harmless but optional.
scaler = StandardScaler()
scaler.fit(X_train)
Out[70]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [71]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [72]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [73]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3,240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [74]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the default KFold used here ignores time ordering; combined with
# the shuffled train/test split, the CV score will be optimistic for this
# autocorrelated series.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987706326919069
In [75]:
# Use the best estimator found by the grid search.
# NOTE: GridSearchCV refits the winning parameter combination on the whole
# training set by default (refit=True), so best_estimator_ is already trained —
# the explicit .fit() call that used to follow here retrained on the same data
# and has been removed as redundant work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [76]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# *probability distributions* — it normalises both inputs to sum to 1 and is
# undefined where qk == 0 while pk > 0. Mortality rates are not a distribution
# and y_test contains zeros, so this "Entropy Value" is not a standard
# regression metric; interpret with caution (or prefer MAE / R^2).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.018375766572997825
R2 Score: 0.9986104757656693
RMSE: 0.135557
Entropy Value: 0.0005417545801711733
In [77]:
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PCn — labelling them with the original feature
# names in selected_cols is misleading, since each PC is a linear mix of ALL
# the PCA input columns. Read this table as "importance of component i".
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[77]:
feature importance
1 human_development_index 0.859285
0 hospital_beds_per_thousand 0.098572
2 extreme_poverty 0.027225
3 gdp_per_capita 0.013788
4 population_density 0.001130
In [78]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[78]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [79]:
country1 = 'Bulgaria'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() materialises an independent frame so the lag-column assignments in the
# next cells do not hit pandas' SettingWithCopyWarning / ambiguous chained-
# assignment behaviour on a filtered slice of the original frame.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [80]:
df_updated
Out[80]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716205

2065 rows × 10 columns

In [81]:
# Convert the time series into a supervised-learning layout: pandas' shift()
# adds lagged copies of the target (previous day / week / month mortality) so
# each row becomes a self-contained tabular observation that XGBoost can
# consume directly, which is what lets the model rank predictors of COVID-19
# mortality per country.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [82]:
# The earliest rows of each country have no history, so the shifted lag columns
# begin as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [83]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# at this point includes 'Mortality Rate' (the prediction target) and its
# lagged copies. The fitted components therefore encode the target itself —
# target leakage that likely inflates the downstream R^2. Consider excluding
# the target before fitting, e.g.
# pca.fit(df_updated.drop(columns=['Mortality Rate']).iloc[:, 2:]).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[83]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [84]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# (the original comment said 3, which did not match the code). Only the first
# n_components highest-variance components are kept.
n_components = 7  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [85]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [86]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [87]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [88]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[88]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [89]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [90]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [91]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [92]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9604734653614051
In [93]:
# Use the best estimator found by the grid search.
# NOTE: GridSearchCV refits the winning parameter combination on the whole
# training set by default (refit=True), so best_estimator_ is already trained —
# the explicit .fit() call that used to follow here retrained on the same data
# and has been removed as redundant work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [94]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0021235139380423036
R2 Score: 0.9989608533920085
RMSE: 0.046082
Entropy Value: 0.00020835376815230597
In [95]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[95]:
feature importance
0 cardiovasc_death_rate 0.583303
5 aged_65_older 0.190242
6 median_age 0.118268
2 female_smokers 0.043382
1 diabetes_prevalence 0.043157
4 life_expectancy 0.011155
3 male_smokers 0.010494
In [96]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[96]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [97]:
country1 = 'Bulgaria'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() materialises an independent frame so the lag-column assignments in the
# next cells do not hit pandas' SettingWithCopyWarning / ambiguous chained-
# assignment behaviour on a filtered slice of the original frame.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [98]:
df_updated
Out[98]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.50 18563.307 65.180 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.50 18563.307 65.180 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.50 18563.307 65.180 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.50 18563.307 65.180 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.50 18563.307 65.180 14.285714
... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 14048.881 80.291 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 14048.881 80.291 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 14048.881 80.291 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 14048.881 80.291 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 14048.881 80.291 0.716205

2065 rows × 8 columns

In [99]:
# Convert the time series into a supervised-learning layout: pandas' shift()
# adds lagged copies of the target (previous day / week / month mortality) so
# each row becomes a self-contained tabular observation that XGBoost can
# consume directly, which is what lets the model rank predictors of COVID-19
# mortality per country.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [100]:
# The earliest rows of each country have no history, so the shifted lag columns
# begin as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [101]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# at this point includes 'Mortality Rate' (the prediction target) and its
# lagged copies. The fitted components therefore encode the target itself —
# target leakage that likely inflates the downstream R^2. Consider excluding
# the target before fitting, e.g.
# pca.fit(df_updated.drop(columns=['Mortality Rate']).iloc[:, 2:]).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[101]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [102]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# (the original comment said 3, which did not match the code). Only the first
# n_components highest-variance components are kept.
n_components = 5  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [103]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [104]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [105]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [106]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[106]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [107]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [108]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [109]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [110]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9604140465460519
In [111]:
# Use the best estimator found by the grid search.
# NOTE: GridSearchCV refits the winning parameter combination on the whole
# training set by default (refit=True), so best_estimator_ is already trained —
# the explicit .fit() call that used to follow here retrained on the same data
# and has been removed as redundant work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [112]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006126929070284454
R2 Score: 0.9970017726529921
RMSE: 0.078275
Entropy Value: 0.0006566407159103827
In [113]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[113]:
feature importance
0 hospital_beds_per_thousand 0.649173
1 human_development_index 0.179720
2 extreme_poverty 0.087485
4 population_density 0.042793
3 gdp_per_capita 0.040829
In [114]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[114]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [115]:
country1 = 'Cyprus'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() materialises an independent frame so the lag-column assignments in the
# next cells do not hit pandas' SettingWithCopyWarning / ambiguous chained-
# assignment behaviour on a filtered slice of the original frame.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [116]:
df_updated
Out[116]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872

2068 rows × 10 columns

In [117]:
# Convert the time series into a supervised-learning layout: pandas' shift()
# adds lagged copies of the target (previous day / week / month mortality) so
# each row becomes a self-contained tabular observation that XGBoost can
# consume directly, which is what lets the model rank predictors of COVID-19
# mortality per country.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [118]:
# The earliest rows of each country have no history, so the shifted lag columns
# begin as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [119]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which
# at this point includes 'Mortality Rate' (the prediction target) and its
# lagged copies. The fitted components therefore encode the target itself —
# target leakage that likely inflates the downstream R^2. Consider excluding
# the target before fitting, e.g.
# pca.fit(df_updated.drop(columns=['Mortality Rate']).iloc[:, 2:]).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[119]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [120]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# (the original comment said 3, which did not match the code). Only the first
# n_components highest-variance components are kept.
n_components = 7  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [121]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [122]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [123]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [124]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[124]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [125]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [126]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [127]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [128]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3240 fits;
# n_jobs=-1 parallelises across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9949062193563151
In [129]:
# GridSearchCV (refit=True by default) has already refit the best
# hyperparameter combination on the whole training set, so
# best_estimator_ is ready to use; the previous extra fit() call only
# retrained the identical model redundantly.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [130]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and a KL-divergence ("entropy") value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalises both vectors to sum to 1 and
# returns the KL divergence KL(pk || qk); it yields inf whenever
# y_test > 0 where y_pred <= 0.  Clip both to a tiny positive floor so
# the reported value is always finite.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0010948126043919594
R2 Score: 0.9971226073313326
RMSE: 0.033088
Entropy Value: 0.0007353489441030162
In [131]:
# Rank the principal-component inputs by their XGBoost importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[131]:
feature importance
1 diabetes_prevalence 0.692885
0 cardiovasc_death_rate 0.156869
6 median_age 0.062790
5 aged_65_older 0.033893
2 female_smokers 0.031649
3 male_smokers 0.012797
4 life_expectancy 0.009117
In [132]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — runs only on this
# machine; consider a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[132]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [133]:
# The two countries compared in this run
country1 = 'Cyprus'
country2 = 'Luxembourg'

# Restrict to the country-health-index features (plus identifiers and the
# target) and to the rows of the two selected countries, in one step.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [134]:
df_updated
Out[134]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 0.000000
... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.20 94277.965 231.447 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.20 94277.965 231.447 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.20 94277.965 231.447 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.20 94277.965 231.447 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.20 94277.965 231.447 0.377872

2068 rows × 8 columns

In [135]:
# The OWID COVID-19 data is a daily time series, while XGBoost expects a
# tabular supervised layout (one observation per row, one feature per
# column).  Shifting the mortality rate within each country with pandas'
# shift() adds the previous day's, week's and month's mortality as lagged
# feature columns, turning the series into a supervised learning problem
# so the model can rank predictors of COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [136]:
# The first rows of each country's series have no history, so the lagged
# columns start as NaN; treat missing history as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [137]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after location/date, which
# here includes 'Mortality Rate' and its lagged copies — the target feeds
# the components later used as model inputs (target leakage).  PCA is
# also fit on the full dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[137]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [138]:
# Keep the first 5 principal components — one per input variable of the
# XGBoost model for the country health index (the previous comment said
# 3, which did not match the code).
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [139]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the raw feature names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [140]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only
# 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [141]:
# First 5 principal components as model inputs; raw mortality rate as target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): the PCA that produced X was fit on columns including
# 'Mortality Rate' and its lags, so X already encodes the target; a
# random split cannot detect that leakage and inflates test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [142]:
# Fit scaling on the training set
# Learn mean/std from the training split only; the test split is
# transformed with the same statistics below (no scaling leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[142]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [143]:
# Apply scaling on the training set
# Uses the mean/std learned from X_train above.
X_train_scaled = scaler.transform(X_train)
In [144]:
# Apply scaling on the test set
# Same training-set statistics — the scaler is not refit on test data.
X_test_scaled = scaler.transform(X_test)
In [145]:
# Base XGBoost regressor; hyperparameters are selected by the grid search below.
xgb_model = xgb.XGBRegressor()

# Search space for the hyperparameter tuning
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [146]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3240 fits;
# n_jobs=-1 parallelises across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9949940053151926
In [147]:
# GridSearchCV (refit=True by default) has already refit the best
# hyperparameter combination on the whole training set, so
# best_estimator_ is ready to use; the previous extra fit() call only
# retrained the identical model redundantly.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [148]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and a KL-divergence ("entropy") value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalises both vectors to sum to 1 and
# returns the KL divergence KL(pk || qk); it yields inf whenever
# y_test > 0 where y_pred <= 0.  Clip both to a tiny positive floor so
# the reported value is always finite.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0014876690956331902
R2 Score: 0.9960900996828079
RMSE: 0.038570
Entropy Value: 0.0011182439482911944
In [149]:
# Rank the principal-component inputs by their XGBoost importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[149]:
feature importance
1 human_development_index 0.638180
0 hospital_beds_per_thousand 0.134692
2 extreme_poverty 0.104035
4 population_density 0.089664
3 gdp_per_capita 0.033430
In [150]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — runs only on this
# machine; consider a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[150]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [151]:
# The two countries compared in this run
country1 = 'Czechia'
country2 = 'Romania'

# Restrict to the population-health-index features (plus identifiers and
# the target) and to the rows of the two selected countries, in one step.
population_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_cols]
In [152]:
df_updated
Out[152]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403

2072 rows × 10 columns

In [153]:
# The OWID COVID-19 data is a daily time series, while XGBoost expects a
# tabular supervised layout (one observation per row, one feature per
# column).  Shifting the mortality rate within each country with pandas'
# shift() adds the previous day's, week's and month's mortality as lagged
# feature columns, turning the series into a supervised learning problem
# so the model can rank predictors of COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [154]:
# The first rows of each country's series have no history, so the lagged
# columns start as NaN; treat missing history as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [155]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after location/date, which
# here includes 'Mortality Rate' and its lagged copies — the target feeds
# the components later used as model inputs (target leakage).  PCA is
# also fit on the full dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[155]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [156]:
# Keep the first 7 principal components — one per input variable of the
# XGBoost model for the population health index (the previous comment
# said 3, which did not match the code).
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [157]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the raw feature names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [158]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only
# 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [159]:
# First 7 principal components as model inputs; raw mortality rate as target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): the PCA that produced X was fit on columns including
# 'Mortality Rate' and its lags, so X already encodes the target; a
# random split cannot detect that leakage and inflates test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [160]:
# Fit scaling on the training set
# Learn mean/std from the training split only; the test split is
# transformed with the same statistics below (no scaling leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[160]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [161]:
# Apply scaling on the training set
# Uses the mean/std learned from X_train above.
X_train_scaled = scaler.transform(X_train)
In [162]:
# Apply scaling on the test set
# Same training-set statistics — the scaler is not refit on test data.
X_test_scaled = scaler.transform(X_test)
In [163]:
# Base XGBoost regressor; hyperparameters are selected by the grid search below.
xgb_model = xgb.XGBRegressor()

# Search space for the hyperparameter tuning
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [164]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3240 fits;
# n_jobs=-1 parallelises across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987279834799028
In [165]:
# GridSearchCV (refit=True by default) has already refit the best
# hyperparameter combination on the whole training set, so
# best_estimator_ is ready to use; the previous extra fit() call only
# retrained the identical model redundantly.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [166]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and a KL-divergence ("entropy") value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalises both vectors to sum to 1 and
# returns the KL divergence KL(pk || qk); it yields inf whenever
# y_test > 0 where y_pred <= 0.  Clip both to a tiny positive floor so
# the reported value is always finite.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0035563501611279766
R2 Score: 0.9975591549551829
RMSE: 0.059635
Entropy Value: 0.00035041497875719764
In [167]:
# Rank the principal-component inputs by their XGBoost importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[167]:
feature importance
0 cardiovasc_death_rate 0.598329
1 diabetes_prevalence 0.185027
5 aged_65_older 0.127458
6 median_age 0.069055
2 female_smokers 0.016719
3 male_smokers 0.002718
4 life_expectancy 0.000695
In [168]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — runs only on this
# machine; consider a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[168]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [169]:
# The two countries compared in this run
country1 = 'Czechia'
country2 = 'Romania'

# Restrict to the country-health-index features (plus identifiers and the
# target) and to the rows of the two selected countries, in one step.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [170]:
df_updated
Out[170]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
4153 Czechia 3/1/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
4154 Czechia 3/2/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
4155 Czechia 3/3/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
4156 Czechia 3/4/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
4157 Czechia 3/5/2020 6.630 0.900 0.0 32605.906 137.176 0.000000
... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.7 23313.199 85.129 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.7 23313.199 85.129 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.7 23313.199 85.129 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.7 23313.199 85.129 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.7 23313.199 85.129 2.036403

2072 rows × 8 columns

In [171]:
# The OWID COVID-19 data is a daily time series, while XGBoost expects a
# tabular supervised layout (one observation per row, one feature per
# column).  Shifting the mortality rate within each country with pandas'
# shift() adds the previous day's, week's and month's mortality as lagged
# feature columns, turning the series into a supervised learning problem
# so the model can rank predictors of COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [172]:
# The first rows of each country's series have no history, so the lagged
# columns start as NaN; treat missing history as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [173]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after location/date, which
# here includes 'Mortality Rate' and its lagged copies — the target feeds
# the components later used as model inputs (target leakage).  PCA is
# also fit on the full dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[173]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [174]:
# Keep the first 5 principal components — one per input variable of the
# XGBoost model for the country health index (the previous comment said
# 3, which did not match the code).
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [175]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the raw feature names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [176]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — only
# 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [177]:
# First 5 principal components as model inputs; raw mortality rate as target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): the PCA that produced X was fit on columns including
# 'Mortality Rate' and its lags, so X already encodes the target; a
# random split cannot detect that leakage and inflates test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [178]:
# Fit scaling on the training set
# Learn mean/std from the training split only; the test split is
# transformed with the same statistics below (no scaling leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[178]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [179]:
# Apply scaling on the training set
# Uses the mean/std learned from X_train above.
X_train_scaled = scaler.transform(X_train)
In [180]:
# Apply scaling on the test set
# Same training-set statistics — the scaler is not refit on test data.
X_test_scaled = scaler.transform(X_test)
In [181]:
# Base XGBoost regressor; hyperparameters are selected by the grid search below.
xgb_model = xgb.XGBRegressor()

# Search space for the hyperparameter tuning
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [182]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3240 fits;
# n_jobs=-1 parallelises across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9980242375402296
In [183]:
# GridSearchCV (refit=True by default) has already refit the best
# hyperparameter combination on the whole training set, so
# best_estimator_ is ready to use; the previous extra fit() call only
# retrained the identical model redundantly.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [184]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 Score, and a KL-divergence ("entropy") value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalises both vectors to sum to 1 and
# returns the KL divergence KL(pk || qk); it yields inf whenever
# y_test > 0 where y_pred <= 0 — exactly what produced the "inf" output
# of this cell.  Clip both to a tiny positive floor so the value is
# always finite.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0037624148328907537
R2 Score: 0.9974177257060381
RMSE: 0.061339
Entropy Value: inf
In [185]:
# Rank the principal-component inputs by their XGBoost importance scores.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[185]:
feature importance
0 hospital_beds_per_thousand 0.862863
1 human_development_index 0.112243
2 extreme_poverty 0.019148
3 gdp_per_capita 0.004974
4 population_density 0.000772
In [186]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — runs only on this
# machine; consider a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[186]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [187]:
# The two countries compared in this run
country1 = 'Denmark'
country2 = 'Ireland'

# Restrict to the population-health-index features (plus identifiers and
# the target) and to the rows of the two selected countries, in one step.
population_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_cols]
In [188]:
df_updated
Out[188]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 80.9 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.3 13.928 38.7 0.491388

2097 rows × 10 columns

In [189]:
# The OWID COVID-19 data is a daily time series, while XGBoost expects a
# tabular supervised layout (one observation per row, one feature per
# column).  Shifting the mortality rate within each country with pandas'
# shift() adds the previous day's, week's and month's mortality as lagged
# feature columns, turning the series into a supervised learning problem
# so the model can rank predictors of COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [190]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [191]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after location/date, which at this point
# includes 'Mortality Rate' (the prediction target) and its three lag columns as well as the
# predictors — the components therefore encode the target (target leakage); confirm whether
# the target/lag columns should be excluded from the PCA input.
# NOTE(review): PCA is fitted on unscaled data, so large-scale features (e.g.
# cardiovasc_death_rate) dominate the components; StandardScaler is only applied later,
# after the PCA transform.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[191]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [192]:
# Keep the first 7 principal components — one per input variable of the population-health
# XGBoost analysis. (The original comment said 3, which did not match the code.)
n_components = 7  # number of input variables for the XGBoost Model Analysis
# NOTE(review): the transform input (iloc[:, 2:]) includes 'Mortality Rate' and its lag
# columns in addition to the predictors — verify this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [193]:
# Wrap the principal components in a DataFrame.
# NOTE(review): these columns are PC1..PC7, not the original variables — labelling them with
# the original feature names is misleading, and the feature-importance table built from this
# frame later inherits the mislabelling. Confirm whether PC-based or raw-feature importances
# were intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [194]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used afterwards — the model's
# X comes from principal_df — so this step only reshapes df_updated before 'Mortality Rate'
# is read below. Consider removing if the dummies are indeed unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [195]:
# The 7 principal components are the model inputs (column names inherited from principal_df);
# the target is the raw mortality rate. This pairing relies on principal_df and df_updated
# sharing identical row order, which holds because principal_df was built directly from
# df_updated's values in order.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of daily time-series rows places neighbouring (near-duplicate)
# days in both train and test, which inflates the test score — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [196]:
# Fit the scaler on the training split only (test statistics are never used for fitting).
# NOTE(review): scaling is applied to the principal components, i.e. *after* PCA; the usual
# practice is to standardise the raw features *before* PCA. Scaling the PCs afterwards is
# only a linear rescale of each component.
scaler = StandardScaler()
scaler.fit(X_train)
Out[196]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [197]:
# Scale the training principal components using the statistics fitted above
X_train_scaled = scaler.transform(X_train)
In [198]:
# Scale the test principal components with the *training* statistics (no test-set leakage here)
X_test_scaled = scaler.transform(X_test)
In [199]:
# Base XGBoost regressor; all tunable hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, shrinkage, ensemble size, split penalty,
# and row/column subsampling rates
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [200]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
# NOTE(review): a CV R^2 of ~0.999 is suspiciously high for this task — consistent with the
# target ('Mortality Rate' and its lags) having been folded into the PCA inputs upstream,
# and with the random split of daily time-series rows; verify before trusting the score.
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991336553758403
In [201]:
# Use the tuned model. GridSearchCV was constructed with the default refit=True, so
# best_estimator_ has already been refit on the full training set — the explicit
# best_model.fit(...) call in the original was redundant and has been removed.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [202]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and a KL-divergence "entropy" diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the KL divergence between the two (normalised)
# distributions. It returns inf whenever a prediction is 0 where the true value is
# positive, and is undefined for negative values — both occur with raw regression
# outputs. Clip both arrays to a small positive floor so the diagnostic stays finite.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0039824430069437
R2 Score: 0.9985146782977569
RMSE: 0.063107
Entropy Value: 0.00082019994222745
In [203]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[203]:
feature importance
1 diabetes_prevalence 0.793221
0 cardiovasc_death_rate 0.123281
6 median_age 0.063364
2 female_smokers 0.012535
3 male_smokers 0.003768
5 aged_65_older 0.003571
4 life_expectancy 0.000260
In [204]:
# Reload the full 26-country dataframe to start the next country-pair analysis fresh.
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[204]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [205]:
# Country pair for this run of the country-health-index analysis
country1 = 'Denmark'
country2 = 'Ireland'

# Keep only the country-health-index predictors plus the target, restricted to the pair.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
In [206]:
df_updated
Out[206]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
5188 Denmark 2/3/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
5189 Denmark 2/4/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
5190 Denmark 2/5/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
5191 Denmark 2/6/2020 2.50 0.940 0.2 46682.515 136.520 0.000000
... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 67335.293 69.874 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 67335.293 69.874 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 67335.293 69.874 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 67335.293 69.874 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 67335.293 69.874 0.491388

2097 rows × 8 columns

In [207]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [208]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [209]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[209]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [210]:
# Keep the first 5 principal components — one per input variable of the country-health
# XGBoost analysis. (The original comment said 3, which did not match the code.)
n_components = 5  # number of input variables for the XGBoost Model Analysis
# NOTE(review): the transform input (iloc[:, 2:]) includes 'Mortality Rate' and its lag
# columns in addition to the predictors — verify this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [211]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [212]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [213]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [214]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[214]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [215]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [216]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [217]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [218]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989935922633931
In [219]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [220]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and a KL-divergence "entropy" diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the normalised distributions;
# it returns inf for zero predictions paired with positive truths and is undefined for
# negative values. Clip both arrays to a small positive floor to keep the value finite.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0036179936828483242
R2 Score: 0.9986506060409794
RMSE: 0.060150
Entropy Value: 0.000665447331955437
In [221]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[221]:
feature importance
1 human_development_index 0.731394
0 hospital_beds_per_thousand 0.224461
2 extreme_poverty 0.024741
3 gdp_per_capita 0.016963
4 population_density 0.002441
In [222]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[222]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [223]:
country1 = 'Estonia'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [224]:
df_updated
Out[224]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2099 rows × 10 columns

In [225]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [226]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [227]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[227]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [228]:
# Keep the first 7 principal components — one per input variable of the population-health
# XGBoost analysis. (The original comment said 3, which did not match the code.)
n_components = 7  # number of input variables for the XGBoost Model Analysis
# NOTE(review): the transform input (iloc[:, 2:]) includes 'Mortality Rate' and its lag
# columns in addition to the predictors — verify this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [229]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [230]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [231]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [232]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[232]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [233]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [234]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [235]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [236]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983509839471967
In [237]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [238]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and a KL-divergence "entropy" diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the normalised distributions;
# it returns inf for zero predictions paired with positive truths (this cell's original
# output was "Entropy Value: inf") and is undefined for negative values. Clip both
# arrays to a small positive floor so the diagnostic stays finite and comparable.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0011442225986659504
R2 Score: 0.998102656857225
RMSE: 0.033826
Entropy Value: inf
In [239]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[239]:
feature importance
1 diabetes_prevalence 0.764753
0 cardiovasc_death_rate 0.126479
5 aged_65_older 0.055045
6 median_age 0.033800
2 female_smokers 0.018890
3 male_smokers 0.000750
4 life_expectancy 0.000282
In [240]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[240]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [241]:
country1 = 'Estonia'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [242]:
df_updated
Out[242]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
6250 Estonia 1/18/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
6251 Estonia 2/5/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
6252 Estonia 2/6/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
6253 Estonia 2/7/2020 4.69 0.892 0.5 29481.252 31.033 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 25063.846 31.212 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 25063.846 31.212 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 25063.846 31.212 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 25063.846 31.212 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 25063.846 31.212 0.631969

2099 rows × 8 columns

In [243]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [244]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [245]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[245]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [246]:
# Keep the first 5 principal components — one per input variable of the country-health
# XGBoost analysis. (The original comment said 3, which did not match the code.)
n_components = 5  # number of input variables for the XGBoost Model Analysis
# NOTE(review): the transform input (iloc[:, 2:]) includes 'Mortality Rate' and its lag
# columns in addition to the predictors — verify this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [247]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [248]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [249]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [250]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[250]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [251]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [252]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [253]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [254]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9988172536777192
In [255]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [256]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and a KL-divergence "entropy" diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the KL divergence between the normalised distributions;
# it returns inf for zero predictions paired with positive truths (this cell's original
# output was "Entropy Value: inf") and is undefined for negative values. Clip both
# arrays to a small positive floor so the diagnostic stays finite and comparable.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0011498507246205968
R2 Score: 0.9980933243320684
RMSE: 0.033909
Entropy Value: inf
In [257]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[257]:
feature importance
1 human_development_index 0.587121
0 hospital_beds_per_thousand 0.383466
2 extreme_poverty 0.025717
3 gdp_per_capita 0.003274
4 population_density 0.000422
In [258]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[258]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [259]:
country1 = 'Finland'
country2 = 'Iceland'

# Restrict to the population-health feature set (plus the target) for the two countries of interest
population_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_cols]
In [260]:
df_updated
Out[260]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2102 rows × 10 columns

In [261]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 1 week, and 1 month
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [262]:
# The first 1/7/30 rows of each country's series have no lag value; treat those as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [263]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point iloc[:, 2:] still includes 'Mortality Rate' and
# its three lag columns, so the target leaks into the fitted components and
# inflates the downstream R^2 — those columns should be dropped before fitting.
# NOTE(review): PCA is scale-sensitive and these columns span very different
# magnitudes; standardizing before (not after) PCA would be more appropriate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[263]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [264]:
# Keep the first 7 principal components — one per input variable of this
# population-health-index analysis (the "3" previously stated here was stale)
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [265]:
# Obtaining the resulting dataframe after performing Principal Component Analysis
# NOTE(review): these columns hold PCA component scores, not the original
# variables — reusing the raw feature names makes the later importance table
# read as if it ranked the original columns when it does not.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [266]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from
# principal_df and y from 'Mortality Rate'), so this step is effectively dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [267]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of time-series rows puts neighbouring
# days of the same country into both train and test, inflating the test score;
# a chronological split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [268]:
# Fit the standard scaler on the training split only, so the test set's
# statistics never leak into the scaling parameters
scaler = StandardScaler()
scaler.fit(X_train)
Out[268]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [269]:
# Apply the training-set scaling parameters to the training data
X_train_scaled = scaler.transform(X_train)
In [270]:
# Apply the training-set scaling parameters to the test data (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [271]:
# Define the (untuned) XGBoost regressor
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid explored by the search below
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [272]:
# Exhaustive grid search with 10-fold cross-validation, parallelised across all cores
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9967115293459013
In [273]:
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full training set, so the explicit best_model.fit(...) call here was
# redundant retraining on the same data — use the tuned estimator directly.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [274]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, plus scipy's entropy()
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# arrays treated as normalized probability distributions, not a regression
# error metric; it returns inf whenever a prediction is 0 (or negative) where
# the true value is nonzero — confirm this measure is actually intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002631359506918055
R2 Score: 0.9977846875396988
RMSE: 0.051297
Entropy Value: 0.0008231147347694647
In [275]:
# Rank the tuned model's feature importances, largest first
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on PCA component scores (X came from
# principal_df), so these values rank principal components; labelling them with
# the original column names in selected_cols is misleading — each component
# mixes all input variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[275]:
feature importance
1 diabetes_prevalence 0.424154
0 cardiovasc_death_rate 0.346029
2 female_smokers 0.087274
5 aged_65_older 0.081840
6 median_age 0.050001
3 male_smokers 0.008944
4 life_expectancy 0.001758
In [276]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# configurable data directory or a path relative to the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[276]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [277]:
country1 = 'Finland'
country2 = 'Iceland'

# Restrict to the country-health feature set (plus the target) for the two countries of interest
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [278]:
df_updated
Out[278]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
7310 Finland 1/29/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
7311 Finland 1/30/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
7312 Finland 1/31/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
7313 Finland 2/1/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
7314 Finland 2/2/2020 3.28 0.938 0.04 40585.721 18.136 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 0.11011

2102 rows × 8 columns

In [279]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 1 week, and 1 month
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [280]:
# The first 1/7/30 rows of each country's series have no lag value; treat those as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [281]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point iloc[:, 2:] still includes 'Mortality Rate' and
# its three lag columns, so the target leaks into the fitted components and
# inflates the downstream R^2 — those columns should be dropped before fitting.
# NOTE(review): PCA is scale-sensitive and these columns span very different
# magnitudes (e.g. gdp_per_capita vs. HDI); standardize before PCA, not after.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[281]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [282]:
# Keep the first 5 principal components — one per input variable of this
# country-health-index analysis (the "3" previously stated here was stale)
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [283]:
# Obtaining the resulting dataframe after performing Principal Component Analysis
# NOTE(review): these columns hold PCA component scores, not the original
# variables — reusing the raw feature names makes the later importance table
# read as if it ranked the original columns when it does not.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [284]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from
# principal_df and y from 'Mortality Rate'), so this step is effectively dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [285]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of time-series rows puts neighbouring
# days of the same country into both train and test, inflating the test score;
# a chronological split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [286]:
# Fit the standard scaler on the training split only, so the test set's
# statistics never leak into the scaling parameters
scaler = StandardScaler()
scaler.fit(X_train)
Out[286]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [287]:
# Apply the training-set scaling parameters to the training data
X_train_scaled = scaler.transform(X_train)
In [288]:
# Apply the training-set scaling parameters to the test data (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [289]:
# Define the (untuned) XGBoost regressor
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid explored by the search below
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [290]:
# Exhaustive grid search with 10-fold cross-validation, parallelised across all cores
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968002075493546
In [291]:
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full training set, so the explicit best_model.fit(...) call here was
# redundant retraining on the same data — use the tuned estimator directly.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [292]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, plus scipy's entropy()
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# arrays treated as normalized probability distributions, not a regression
# error metric; it returns inf whenever a prediction is 0 (or negative) where
# the true value is nonzero — confirm this measure is actually intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0025838187798911654
R2 Score: 0.9978247115518787
RMSE: 0.050831
Entropy Value: 0.0013343036085185639
In [293]:
# Rank the tuned model's feature importances, largest first
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on PCA component scores (X came from
# principal_df), so these values rank principal components; labelling them with
# the original column names in selected_cols is misleading — each component
# mixes all input variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[293]:
feature importance
0 hospital_beds_per_thousand 0.747794
1 human_development_index 0.228597
3 gdp_per_capita 0.010147
2 extreme_poverty 0.009692
4 population_density 0.003770
In [294]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# configurable data directory or a path relative to the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[294]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [295]:
country1 = 'France'
country2 = 'Italy'

# Restrict to the population-health feature set (plus the target) for the two countries of interest
population_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_cols]
In [296]:
df_updated
Out[296]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8377 France 1/25/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8378 France 1/26/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8379 France 1/27/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8380 France 1/28/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2135 rows × 10 columns

In [297]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 1 week, and 1 month
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [298]:
# The first 1/7/30 rows of each country's series have no lag value; treat those as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [299]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point iloc[:, 2:] still includes 'Mortality Rate' and
# its three lag columns, so the target leaks into the fitted components and
# inflates the downstream R^2 — those columns should be dropped before fitting.
# NOTE(review): PCA is scale-sensitive and these columns span very different
# magnitudes; standardizing before (not after) PCA would be more appropriate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[299]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [300]:
# Keep the first 7 principal components — one per input variable of this
# population-health-index analysis (the "3" previously stated here was stale)
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [301]:
# Obtaining the resulting dataframe after performing Principal Component Analysis
# NOTE(review): these columns hold PCA component scores, not the original
# variables — reusing the raw feature names makes the later importance table
# read as if it ranked the original columns when it does not.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [302]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from
# principal_df and y from 'Mortality Rate'), so this step is effectively dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [303]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of time-series rows puts neighbouring
# days of the same country into both train and test, inflating the test score;
# a chronological split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [304]:
# Fit the standard scaler on the training split only, so the test set's
# statistics never leak into the scaling parameters
scaler = StandardScaler()
scaler.fit(X_train)
Out[304]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [305]:
# Apply the training-set scaling parameters to the training data
X_train_scaled = scaler.transform(X_train)
In [306]:
# Apply the training-set scaling parameters to the test data (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [307]:
# Define the (untuned) XGBoost regressor
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid explored by the search below
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [308]:
# Exhaustive grid search with 10-fold cross-validation, parallelised across all cores
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9974380799964881
In [309]:
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full training set, so the explicit best_model.fit(...) call here was
# redundant retraining on the same data — use the tuned estimator directly.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [310]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, plus scipy's entropy()
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# arrays treated as normalized probability distributions, not a regression
# error metric; it returns inf whenever a prediction is 0 (or negative) where
# the true value is nonzero — confirm this measure is actually intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.04203231904296749
R2 Score: 0.9977496670068304
RMSE: 0.205018
Entropy Value: 0.001029589932983813
In [311]:
# Rank the tuned model's feature importances, largest first
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on PCA component scores (X came from
# principal_df), so these values rank principal components; labelling them with
# the original column names in selected_cols is misleading — each component
# mixes all input variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[311]:
feature importance
1 diabetes_prevalence 0.900030
6 median_age 0.037244
2 female_smokers 0.028587
5 aged_65_older 0.015186
0 cardiovasc_death_rate 0.009428
3 male_smokers 0.006895
4 life_expectancy 0.002631
In [312]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable; prefer a
# configurable data directory or a path relative to the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[312]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [313]:
country1 = 'France'
country2 = 'Italy'

# Restrict to the country-health feature set (plus the target) for the two countries of interest
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [314]:
df_updated
Out[314]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.00 35220.084 205.859 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.00 35220.084 205.859 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.00 35220.084 205.859 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.00 35220.084 205.859 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.00 35220.084 205.859 0.735109

2135 rows × 8 columns

In [315]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the per-country mortality series by 1 day, 1 week, and 1 month
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1), ('prev_week_mortality', 7), ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [316]:
# The first 1/7/30 rows of each country's series have no lag value; treat those as 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [317]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point iloc[:, 2:] still includes 'Mortality Rate' and
# its three lag columns, so the target leaks into the fitted components and
# inflates the downstream R^2 — those columns should be dropped before fitting.
# NOTE(review): PCA is scale-sensitive and these columns span very different
# magnitudes (e.g. gdp_per_capita vs. HDI); standardize before PCA, not after.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[317]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [318]:
# Keep the first 5 principal components — one per input variable of this
# country-health-index analysis (the "3" previously stated here was stale)
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [319]:
# Obtaining the resulting dataframe after performing Principal Component Analysis
# NOTE(review): these columns hold PCA component scores, not the original
# variables — reusing the raw feature names makes the later importance table
# read as if it ranked the original columns when it does not.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [320]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from
# principal_df and y from 'Mortality Rate'), so this step is effectively dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [321]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of time-series rows puts neighbouring
# days of the same country into both train and test, inflating the test score;
# a chronological split would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [322]:
# Fit the standard scaler on the training split only, so the test set's
# statistics never leak into the scaling parameters
scaler = StandardScaler()
scaler.fit(X_train)
Out[322]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [323]:
# Apply the training-set scaling parameters to the training data
X_train_scaled = scaler.transform(X_train)
In [324]:
# Apply the training-set scaling parameters to the test data (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [325]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune — 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [326]:
# Perform grid search and 10-fold cross-validation (k = 10): 324 x 10 = 3240 model fits
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.997212894312281
In [327]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits best_estimator_
# on the full training set, so this explicit fit() re-trains the same model on the
# same data — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [328]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and computes the
# KL divergence between them; mortality rates are not probability distributions, and
# a zero in y_pred where y_test is non-zero yields inf — reconsider this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.05589325373552914
R2 Score: 0.9970075780770488
RMSE: 0.236418
Entropy Value: 0.0014961704290713798
In [329]:
# Rank the model inputs by the fitted booster's importance scores.
# (The 'feature' labels are the principal-component columns defined earlier.)
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[329]:
feature importance
1 human_development_index 0.804554
0 hospital_beds_per_thousand 0.079309
4 population_density 0.068506
2 extreme_poverty 0.039535
3 gdp_per_capita 0.008096
In [330]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR or
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[330]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [331]:
# Country pair for this run of the population-health-index analysis.
country1 = 'Netherlands'
country2 = 'Sweden'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [332]:
# Preview the filtered two-country frame
df_updated
Out[332]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2100 rows × 10 columns

In [333]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (groupby('location') keeps each country's lags from bleeding into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [334]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no earlier value to lag from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [335]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] at this point includes 'Mortality Rate' and its
# lag columns, so the target leaks into the components used as model inputs — this
# likely explains the near-perfect R^2 below. PCA is also fitted before the
# train/test split and on unscaled columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[335]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [336]:
# Keeping the first 7 principal components, matching the number of input variables
# for the XGBoost Model Analysis for the population health index
# (the previous comment said 3, which contradicted the value assigned below).
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [337]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features —
# the feature names here (and in the importance table below) are misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [338]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [339]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # PCA scores; rows align with df_updated by position
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffle split of a daily time series — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [340]:
# Fit scaling on the training set only; the test set reuses these statistics.
# NOTE(review): scaling is applied AFTER PCA — the components were computed on
# unscaled columns, so high-variance features dominate them.
scaler = StandardScaler()
scaler.fit(X_train)
Out[340]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [341]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [342]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [343]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune — 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [344]:
# Perform grid search and 10-fold cross-validation (k = 10): 324 x 10 = 3240 model fits
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9974557008389675
In [345]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits best_estimator_
# on the full training set, so this explicit fit() is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [346]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and computes a KL
# divergence — dubious as a regression metric; zeros in y_pred can yield inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009632293525443848
R2 Score: 0.9990787800266875
RMSE: 0.098144
Entropy Value: 0.0005803548472550031
In [347]:
# Rank the model inputs by the fitted booster's importance scores.
# NOTE(review): the 'feature' labels are principal components, not the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[347]:
feature importance
1 diabetes_prevalence 0.788294
2 female_smokers 0.150990
6 median_age 0.023584
0 cardiovasc_death_rate 0.017991
3 male_smokers 0.017966
5 aged_65_older 0.000835
4 life_expectancy 0.000340
In [348]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR or
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[348]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [349]:
# Country pair for this run of the country-health-index analysis.
country1 = 'Netherlands'
country2 = 'Sweden'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [350]:
# Preview the filtered two-country frame
df_updated
Out[350]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 0.000000
... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 0.816005

2100 rows × 8 columns

In [351]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (groupby('location') keeps each country's lags from bleeding into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [352]:
# Zero-fill the lag columns: the first 1/7/30 rows of each country have no
# earlier observation to lag from, so shift() left them as NaN.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [353]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] at this point includes 'Mortality Rate' and its
# lag columns, so the target leaks into the components used as model inputs — this
# likely explains the near-perfect R^2 below. PCA is also fitted before the
# train/test split and on unscaled columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[353]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [354]:
# Keeping the first 5 principal components, matching the number of input variables
# for the XGBoost Model Analysis for the country health index
# (the previous comment said 3, which contradicted the value assigned below).
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [355]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features —
# the feature names here (and in the importance table below) are misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [356]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [357]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values  # PCA scores; rows align with df_updated by position
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffle split of a daily time series — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [358]:
# Fit scaling on the training set only; the test set reuses these statistics.
# NOTE(review): scaling is applied AFTER PCA — the components were computed on
# unscaled columns, so high-variance features dominate them.
scaler = StandardScaler()
scaler.fit(X_train)
Out[358]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [359]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [360]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [361]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune — 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [362]:
# Perform grid search and 10-fold cross-validation (k = 10): 324 x 10 = 3240 model fits
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990746310006573
In [363]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits best_estimator_
# on the full training set, so this explicit fit() is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [364]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and computes a KL
# divergence — dubious as a regression metric; zeros in y_pred can yield inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01681748604978763
R2 Score: 0.9983915975972862
RMSE: 0.129682
Entropy Value: 0.0005547247075158208
In [365]:
# Rank the model inputs by the fitted booster's importance scores.
# NOTE(review): the 'feature' labels are principal components, not the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[365]:
feature importance
1 human_development_index 0.551596
2 extreme_poverty 0.253784
0 hospital_beds_per_thousand 0.164178
3 gdp_per_capita 0.029412
4 population_density 0.001030
In [366]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR or
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[366]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [367]:
# Country pair for this run of the population-health-index analysis.
country1 = 'Portugal'
country2 = 'Spain'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [368]:
# Preview the filtered two-country frame
df_updated
Out[368]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2097 rows × 10 columns

In [369]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (groupby('location') keeps each country's lags from bleeding into the other's)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [370]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no earlier value to lag from)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [371]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] at this point includes 'Mortality Rate' and its
# lag columns, so the target leaks into the components used as model inputs — this
# likely explains the near-perfect R^2 below. PCA is also fitted before the
# train/test split and on unscaled columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[371]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [372]:
# Keeping the first 7 principal components, matching the number of input variables
# for the XGBoost Model Analysis for the population health index
# (the previous comment said 3, which contradicted the value assigned below).
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [373]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original features —
# the feature names here (and in the importance table below) are misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [374]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [375]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # PCA scores; rows align with df_updated by position
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffle split of a daily time series — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [376]:
# Fit scaling on the training set only; the test set reuses these statistics.
# NOTE(review): scaling is applied AFTER PCA — the components were computed on
# unscaled columns, so high-variance features dominate them.
scaler = StandardScaler()
scaler.fit(X_train)
Out[376]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [377]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [378]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [379]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune — 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [380]:
# Perform grid search and 10-fold cross-validation (k = 10): 324 x 10 = 3240 model fits
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9991285362384288
In [381]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits best_estimator_
# on the full training set, so this explicit fit() is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [382]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and computes a KL
# divergence — dubious as a regression metric; zeros in y_pred can yield inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012129239818308182
R2 Score: 0.9977801811586169
RMSE: 0.110133
Entropy Value: 0.0005886831079683492
In [383]:
# Rank the model inputs by the fitted booster's importance scores.
# NOTE(review): the 'feature' labels are principal components, not the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[383]:
feature importance
5 aged_65_older 0.558126
0 cardiovasc_death_rate 0.307971
1 diabetes_prevalence 0.111634
2 female_smokers 0.018149
3 male_smokers 0.003648
6 median_age 0.000360
4 life_expectancy 0.000112
In [384]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR or
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[384]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [385]:
# Country pair for this run of the country-health-index analysis.
country1 = 'Portugal'
country2 = 'Spain'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [386]:
# Preview the filtered two-country frame
df_updated
Out[386]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 0.000000
... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 0.855148

2097 rows × 8 columns

In [387]:
# Convert the time series into a supervised-learning table: pandas shift()
# creates lagged copies of the target so that each row carries the mortality
# rate observed 1 day, 7 days, and 30 days earlier. XGBoost needs this
# tabular (one row = one observation, one column = one feature) layout to be
# applied directly to the OWID COVID-19 time series and to rank predictors
# of mortality per country.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [388]:
# The first 1/7/30 rows of each country have no history, so the shifted
# columns start with NaN; treat that missing history as zero mortality
# instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [389]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): iloc[:, 2:] includes 'Mortality Rate' and its three lag
# columns, so the components are partly built from the prediction target
# (target leakage) — this likely inflates the near-perfect R^2 downstream.
# NOTE(review): PCA is also fit on all rows before the train/test split,
# leaking test-set statistics; fit it on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[389]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [390]:
# Keep the first 5 principal components, matching the number of input
# variables used for the XGBoost Model Analysis for the country health index.
n_components = 5  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [391]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns here are principal components, not the original
# features — reusing the raw feature names is misleading, and downstream
# "feature importances" will describe PCs, not these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [392]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model below (X is
# built from principal_df); this step only removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [393]:
# Model inputs are the PCA component scores; the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [394]:
# Fit scaling on the training set
# Mean/std are learned from the training split only, so scaling itself
# does not leak test-set statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[394]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [395]:
# Apply scaling on the training set
# Standardize with the mean/std learned from the training data above.
X_train_scaled = scaler.transform(X_train)
In [396]:
# Apply scaling on the test set
# Reuse the training-set statistics (no refit) on the held-out data.
X_test_scaled = scaler.transform(X_test)
In [397]:
# Base model: gradient-boosted regression trees.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 combinations).
params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [398]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 combinations x 10 folds = 3240 fits; n_jobs=-1 parallelizes across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983792131946579
In [399]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already
# been refit on the full training set with the best hyperparameters —
# calling fit() again here was redundant (identical model, wasted compute).
best_model = grid_search.best_estimator_

# Predict mortality on the held-out, scaled test features.
y_pred = best_model.predict(X_test_scaled)
In [400]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of
# the two arrays after normalizing them into probability distributions — it
# is not the entropy of the predictions/residuals, and it is ill-defined
# when y_pred is 0 where y_test is not. Interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010138589742890305
R2 Score: 0.9981444976871221
RMSE: 0.100691
Entropy Value: 0.0005635017566477138
In [401]:
# Gain-based importances from the tuned model, ranked descending.
# NOTE(review): X was built from PCA component scores, so these importances
# belong to principal components; the raw feature names are labels only.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[401]:
feature importance
1 human_development_index 0.528876
0 hospital_beds_per_thousand 0.437493
2 extreme_poverty 0.029621
3 gdp_per_capita 0.003685
4 population_density 0.000326
In [402]:
# Importing the dataframe of all 26 countries
# TODO(review): hardcoded absolute local path — load from a configurable
# data directory (e.g. a pathlib.Path DATA_DIR) so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[402]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [403]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Keep the demographic/health "population health index" features plus the
# target, and restrict the rows to the two countries being compared.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers', 'male_smokers',
                          'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
In [404]:
# Preview the filtered two-country frame (rich display of the last expression).
df_updated
Out[404]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2091 rows × 10 columns

In [405]:
# Convert the time series into a supervised-learning table: pandas shift()
# creates lagged copies of the target so that each row carries the mortality
# rate observed 1 day, 7 days, and 30 days earlier. XGBoost needs this
# tabular (one row = one observation, one column = one feature) layout to be
# applied directly to the OWID COVID-19 time series and to rank predictors
# of mortality per country.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [406]:
# The first 1/7/30 rows of each country have no history, so the shifted
# columns start with NaN; treat that missing history as zero mortality
# instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [407]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): iloc[:, 2:] includes 'Mortality Rate' and its three lag
# columns, so the components are partly built from the prediction target
# (target leakage) — this likely inflates the near-perfect R^2 downstream.
# NOTE(review): PCA is also fit on all rows before the train/test split,
# leaking test-set statistics; fit it on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[407]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [408]:
# Keep the first 7 principal components, matching the number of input
# variables used for the XGBoost Model Analysis for the population health index.
n_components = 7  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [409]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns here are principal components, not the original
# features — reusing the raw feature names is misleading, and downstream
# "feature importances" will describe PCs, not these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [410]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model below (X is
# built from principal_df); this step only removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [411]:
# Model inputs are the PCA component scores; the target is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [412]:
# Fit scaling on the training set
# Mean/std are learned from the training split only, so scaling itself
# does not leak test-set statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[412]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [413]:
# Apply scaling on the training set
# Standardize with the mean/std learned from the training data above.
X_train_scaled = scaler.transform(X_train)
In [414]:
# Apply scaling on the test set
# Reuse the training-set statistics (no refit) on the held-out data.
X_test_scaled = scaler.transform(X_test)
In [415]:
# Base model: gradient-boosted regression trees.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 combinations).
params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [416]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 combinations x 10 folds = 3240 fits; n_jobs=-1 parallelizes across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984789767307959
In [417]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already
# been refit on the full training set with the best hyperparameters —
# calling fit() again here was redundant (identical model, wasted compute).
best_model = grid_search.best_estimator_

# Predict mortality on the held-out, scaled test features.
y_pred = best_model.predict(X_test_scaled)
In [418]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of
# the two arrays after normalizing them into probability distributions — it
# is not the entropy of the predictions/residuals, and it is ill-defined
# when y_pred is 0 where y_test is not. Interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0027083668308768272
R2 Score: 0.9986712195046868
RMSE: 0.052042
Entropy Value: 0.00047135693736216383
In [419]:
# Gain-based importances from the tuned model, ranked descending.
# NOTE(review): X was built from PCA component scores, so these importances
# belong to principal components; the raw feature names are labels only.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[419]:
feature importance
6 median_age 0.593303
1 diabetes_prevalence 0.295471
0 cardiovasc_death_rate 0.053646
5 aged_65_older 0.050607
4 life_expectancy 0.003674
2 female_smokers 0.002363
3 male_smokers 0.000937
In [420]:
# Importing the dataframe of all 26 countries
# TODO(review): hardcoded absolute local path — load from a configurable
# data directory (e.g. a pathlib.Path DATA_DIR) so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[420]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [421]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Keep the socio-economic "country health index" features plus the target,
# and restrict the rows to the two countries being compared.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
In [422]:
# Preview the filtered two-country frame (rich display of the last expression).
df_updated
Out[422]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 0.536669

2091 rows × 8 columns

In [423]:
# Convert the time series into a supervised-learning table: pandas shift()
# creates lagged copies of the target so that each row carries the mortality
# rate observed 1 day, 7 days, and 30 days earlier. XGBoost needs this
# tabular (one row = one observation, one column = one feature) layout to be
# applied directly to the OWID COVID-19 time series and to rank predictors
# of mortality per country.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [424]:
# The first 1/7/30 rows of each country have no history, so the shifted
# columns start with NaN; treat that missing history as zero mortality
# instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [425]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): iloc[:, 2:] includes 'Mortality Rate' and its three lag
# columns, so the components are partly built from the prediction target
# (target leakage) — this likely inflates the near-perfect R^2 downstream.
# NOTE(review): PCA is also fit on all rows before the train/test split,
# leaking test-set statistics; fit it on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[425]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [426]:
# Keep the first 5 principal components, matching the number of input
# variables used for the XGBoost Model Analysis for the country health index.
n_components = 5  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [427]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns here are principal components, not the original
# features — reusing the raw feature names is misleading, and downstream
# "feature importances" will describe PCs, not these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [428]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model below (X is
# built from principal_df); this step only removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [429]:
# Model inputs are the PCA component scores; the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [430]:
# Fit scaling on the training set
# Mean/std are learned from the training split only, so scaling itself
# does not leak test-set statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[430]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [431]:
# Apply scaling on the training set
# Standardize with the mean/std learned from the training data above.
X_train_scaled = scaler.transform(X_train)
In [432]:
# Apply scaling on the test set
# Reuse the training-set statistics (no refit) on the held-out data.
X_test_scaled = scaler.transform(X_test)
In [433]:
# Base model: gradient-boosted regression trees.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 combinations).
params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [434]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 combinations x 10 folds = 3240 fits; n_jobs=-1 parallelizes across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998514703095446
In [435]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already
# been refit on the full training set with the best hyperparameters —
# calling fit() again here was redundant (identical model, wasted compute).
best_model = grid_search.best_estimator_

# Predict mortality on the held-out, scaled test features.
y_pred = best_model.predict(X_test_scaled)
In [436]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of
# the two arrays after normalizing them into probability distributions — it
# is not the entropy of the predictions/residuals, and it is ill-defined
# when y_pred is 0 where y_test is not. Interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005931900948646389
R2 Score: 0.9970896873382034
RMSE: 0.077019
Entropy Value: 0.0007898893113525947
In [437]:
# Gain-based importances from the tuned model, ranked descending.
# NOTE(review): X was built from PCA component scores, so these importances
# belong to principal components; the raw feature names are labels only.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[437]:
feature importance
1 human_development_index 0.825169
0 hospital_beds_per_thousand 0.143681
2 extreme_poverty 0.022433
3 gdp_per_capita 0.006149
4 population_density 0.002567
In [438]:
# Importing the dataframe of all 26 countries
# TODO(review): hardcoded absolute local path — load from a configurable
# data directory (e.g. a pathlib.Path DATA_DIR) so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[438]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [439]:
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [440]:
# Preview the filtered two-country frame (rich display of the last expression).
df_updated
Out[440]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [441]:
# Convert the time series into a supervised-learning table: pandas shift()
# creates lagged copies of the target so that each row carries the mortality
# rate observed 1 day, 7 days, and 30 days earlier. XGBoost needs this
# tabular (one row = one observation, one column = one feature) layout to be
# applied directly to the OWID COVID-19 time series and to rank predictors
# of mortality per country.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [442]:
# The first 1/7/30 rows of each country have no history, so the shifted
# columns start with NaN; treat that missing history as zero mortality
# instead of dropping the rows.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [443]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): iloc[:, 2:] includes 'Mortality Rate' and its three lag
# columns, so the components are partly built from the prediction target
# (target leakage) — this likely inflates the R^2 downstream.
# NOTE(review): PCA is also fit on all rows before the train/test split,
# leaking test-set statistics; fit it on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[443]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [444]:
# Keep the first 7 principal components, matching the number of input
# variables used for the XGBoost Model Analysis for the population health index.
n_components = 7  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [445]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns here are principal components, not the original
# features — reusing the raw feature names is misleading, and downstream
# "feature importances" will describe PCs, not these named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [446]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model below (X is
# built from principal_df); this step only removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [447]:
# Model inputs are the PCA component scores; the target is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [448]:
# Fit scaling on the training set
# Mean/std are learned from the training split only, so scaling itself
# does not leak test-set statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[448]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [449]:
# Apply scaling on the training set
# Standardize with the mean/std learned from the training data above.
X_train_scaled = scaler.transform(X_train)
In [450]:
# Apply scaling on the test set
# Reuse the training-set statistics (no refit) on the held-out data.
X_test_scaled = scaler.transform(X_test)
In [451]:
# Base model: gradient-boosted regression trees.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 combinations).
params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [452]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 combinations x 10 folds = 3240 fits; n_jobs=-1 parallelizes across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9573756200168303
In [453]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already
# been refit on the full training set with the best hyperparameters —
# calling fit() again here was redundant (identical model, wasted compute).
best_model = grid_search.best_estimator_

# Predict mortality on the held-out, scaled test features.
y_pred = best_model.predict(X_test_scaled)
In [454]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of
# the two arrays after normalizing them into probability distributions — it
# is not the entropy of the predictions/residuals, and it is ill-defined
# when y_pred is 0 where y_test is not. Interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  3.067522668688002
R2 Score: 0.8735128665771991
RMSE: 1.751434
Entropy Value: 0.00997828364305316
In [455]:
# Gain-based importances from the tuned model, ranked descending.
# NOTE(review): X was built from PCA component scores, so these importances
# belong to principal components; the raw feature names are labels only.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[455]:
feature importance
0 cardiovasc_death_rate 0.515435
1 diabetes_prevalence 0.265632
2 female_smokers 0.128937
5 aged_65_older 0.040833
6 median_age 0.026848
4 life_expectancy 0.013474
3 male_smokers 0.008841
In [456]:
# Importing the dataframe of all 26 countries
# TODO(review): hardcoded absolute local path — load from a configurable
# data directory (e.g. a pathlib.Path DATA_DIR) so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[456]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [457]:
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the country health index
# .copy() makes the filtered slice an independent DataFrame so the lagged-mortality
# columns assigned in later cells do not raise SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [458]:
df_updated
Out[458]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 22.222222
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 1.084791

2136 rows × 8 columns

In [459]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate by 1 day, 7 days, and 30 days; shift() is applied within
# each location group so lagged values never bleed across country boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [460]:
# The earliest rows of each country have no lag history yet; fill those NaNs with 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [461]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes 'Mortality Rate' and the three lagged-mortality columns — the prediction
# target leaks into the components; confirm this is intended. PCA is also fit on
# unscaled data, so the highest-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[461]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [462]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [463]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear combination of
# *all* PCA inputs), not the original variable it is named after — these labels make
# the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density'])
principal_df['location'] = df_updated['location'].values
In [464]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used by the model below (X is built from
# principal_df), so the practical effect is removing the 'location' column — confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [465]:
# Model inputs: the principal components (row-aligned with df_updated by
# construction); target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series puts adjacent,
# highly-correlated days in both train and test, which can inflate the scores —
# consider a chronological split for honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [466]:
# Fit scaling on the training set
# Mean/std statistics are learned from the training split only, so no test-set
# information leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[466]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [467]:
# Apply scaling on the training set
# (standardize using the mean/std fitted on the training split)
X_train_scaled = scaler.transform(X_train)
In [468]:
# Apply scaling on the test set
# (reuses the training-split statistics — the test set is never fit)
X_test_scaled = scaler.transform(X_test)
In [469]:
# Define XGBoost model with default settings; hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, learning rate, ensemble size,
# minimum split loss (gamma), and row/column subsampling fractions.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
In [470]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Uses XGBRegressor's default scorer (R^2); n_jobs=-1 parallelises over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9555599032367461
In [471]:
# GridSearchCV refits the best estimator on the whole training set by default
# (refit=True), so best_estimator_ is already trained — re-fitting it here was
# redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [472]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# *probability distributions* (both inputs are normalized to sum to 1); applying it
# to raw regression targets/predictions is not a standard regression metric — confirm
# intent. It also returns inf if any y_pred element is 0 where y_test is non-zero.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  1.3831340557751377
R2 Score: 0.9429674428683925
RMSE: 1.176067
Entropy Value: 0.007189751715215273
In [473]:
# Rank the model inputs by XGBoost's learned importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[473]:
feature importance
1 human_development_index 0.471024
0 hospital_beds_per_thousand 0.373521
2 extreme_poverty 0.081580
4 population_density 0.049132
3 gdp_per_capita 0.024743
In [7]:
# Country Pair by Pair Analysis relative to cardiovascular death rate
In [8]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# consider a configurable DATA_DIR / relative path.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[8]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [9]:
# One DataFrame per country, listed pair by pair (13 pairs of countries matched on
# cardiovascular death rate); each frame holds that country's daily rows.
df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]

df_Canada = df.loc[df["location"] == "Canada"]
df_Cyprus = df.loc[df["location"] == "Cyprus"]

df_Denmark = df.loc[df["location"] == "Denmark"]
df_Finland = df.loc[df["location"] == "Finland"]

df_France = df.loc[df["location"] == "France"]
df_Iceland = df.loc[df["location"] == "Iceland"]

df_Ireland = df.loc[df["location"] == "Ireland"]
df_Italy = df.loc[df["location"] == "Italy"]

df_Luxembourg = df.loc[df["location"] == "Luxembourg"]
df_Netherlands = df.loc[df["location"] == "Netherlands"]

df_Portugal = df.loc[df["location"] == "Portugal"]
df_Spain = df.loc[df["location"] == "Spain"]

df_Sweden = df.loc[df["location"] == "Sweden"]
df_Switzerland = df.loc[df["location"] == "Switzerland"]

df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_UnitedStates = df.loc[df["location"] == "United States"]

df_Czechia = df.loc[df["location"] == "Czechia"]
df_Estonia = df.loc[df["location"] == "Estonia"]

df_Slovakia = df.loc[df["location"] == "Slovakia"]
df_Slovenia = df.loc[df["location"] == "Slovenia"]

df_Bulgaria = df.loc[df["location"] == "Bulgaria"]
df_Latvia = df.loc[df["location"] == "Latvia"]

df_Romania = df.loc[df["location"] == "Romania"]
df_Serbia = df.loc[df["location"] == "Serbia"]
In [10]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [11]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): this writes to the working directory but the next cell reads the
# file from C:/Users/marco/Downloads — presumably the file is moved by hand; also,
# to_csv without index=False writes the index as an extra unnamed column.
dataframe_one.to_csv("dataframe-one.csv")
In [12]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[12]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [480]:
country1 = 'Austria'
country2 = 'Belgium'

# Extracting important features for XGBoost Model Analysis for the population health index
# .copy() makes the filtered slice an independent DataFrame so the lagged-mortality
# columns assigned in later cells do not raise SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [481]:
df_updated
Out[481]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2095 Belgium 12/26/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2096 Belgium 12/27/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2097 Belgium 12/28/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2098 Belgium 12/29/2022 4.29 25.1 31.4 81.63 18.571 41.8 0.711787

2099 rows × 9 columns

In [482]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate by 1 day, 7 days, and 30 days; shift() is applied within
# each location group so lagged values never bleed across country boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [483]:
# The earliest rows of each country have no lag history yet; fill those NaNs with 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [484]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes 'Mortality Rate' and the three lagged-mortality columns — the prediction
# target leaks into the components; confirm this is intended. PCA is also fit on
# unscaled data, so the highest-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[484]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [485]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# Project the data and keep only the first 6 components of the full PCA projection.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [486]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear combination of
# *all* PCA inputs), not the original variable it is named after — these labels make
# the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [487]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used by the model below (X is built from
# principal_df), so the practical effect is removing the 'location' column — confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [488]:
# Model inputs: the principal components (row-aligned with df_updated by
# construction); target: the raw mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series puts adjacent,
# highly-correlated days in both train and test, which can inflate the scores —
# consider a chronological split for honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [489]:
# Fit scaling on the training set
# Mean/std statistics are learned from the training split only, so no test-set
# information leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[489]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [490]:
# Apply scaling on the training set
# (standardize using the mean/std fitted on the training split)
X_train_scaled = scaler.transform(X_train)
In [491]:
# Apply scaling on the test set
# (reuses the training-split statistics — the test set is never fit)
X_test_scaled = scaler.transform(X_test)
In [492]:
# Define XGBoost model with default settings; hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, learning rate, ensemble size,
# minimum split loss (gamma), and row/column subsampling fractions.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
In [493]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Uses XGBRegressor's default scorer (R^2); n_jobs=-1 parallelises over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.998456196460482
In [494]:
# GridSearchCV refits the best estimator on the whole training set by default
# (refit=True), so best_estimator_ is already trained — re-fitting it here was
# redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [495]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# *probability distributions* (both inputs are normalized to sum to 1); applying it
# to raw regression targets/predictions is not a standard regression metric — confirm
# intent. It also returns inf if any y_pred element is 0 where y_test is non-zero.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.016140462125189346
R2 Score: 0.9986367589447218
RMSE: 0.127045
Entropy Value: 0.0007186737813811423
In [496]:
# Rank the model inputs by XGBoost's learned importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[496]:
feature importance
0 diabetes_prevalence 0.571618
1 female_smokers 0.179058
2 male_smokers 0.150706
5 median_age 0.080402
3 life_expectancy 0.018095
4 aged_65_older 0.000120
In [497]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# consider a configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[497]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [498]:
country1 = 'Austria'
country2 = 'Belgium'

# Extracting important features for XGBoost Model Analysis for the country health index
# .copy() makes the filtered slice an independent DataFrame so the lagged-mortality
# columns assigned in later cells do not raise SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [499]:
df_updated
Out[499]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787

2099 rows × 9 columns

In [500]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality rate by 1 day, 7 days, and 30 days; shift() is applied within
# each location group so lagged values never bleed across country boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [501]:
# The earliest rows of each country have no lag history yet; fill those NaNs with 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [502]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which here
# includes 'Mortality Rate' and the three lagged-mortality columns — the prediction
# target leaks into the components; confirm this is intended. PCA is also fit on
# unscaled data, so the highest-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[502]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [503]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# Project the data and keep only the first 6 components of the full PCA projection.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [504]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a linear combination of
# *all* PCA inputs), not the original variable it is named after — these labels make
# the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [505]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used by the model below (X is built from
# principal_df), so the practical effect is removing the 'location' column — confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [506]:
# Model inputs: the principal components (row-aligned with df_updated by
# construction); target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series puts adjacent,
# highly-correlated days in both train and test, which can inflate the scores —
# consider a chronological split for honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [507]:
# Fit scaling on the training set
# Mean/std statistics are learned from the training split only, so no test-set
# information leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[507]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [508]:
# Apply scaling on the training set
# (standardize using the mean/std fitted on the training split)
X_train_scaled = scaler.transform(X_train)
In [509]:
# Apply scaling on the test set
# (reuses the training-split statistics — the test set is never fit)
X_test_scaled = scaler.transform(X_test)
In [510]:
# Define XGBoost model with default settings; hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, learning rate, ensemble size,
# minimum split loss (gamma), and row/column subsampling fractions.
params = dict(max_depth=[3, 4, 5],
              learning_rate=[0.1, 0.01, 0.001],
              n_estimators=[50, 100, 150],
              gamma=[0, 0.1, 0.2],
              subsample=[0.8, 0.9],
              colsample_bytree=[0.8, 0.9])
In [511]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Uses XGBRegressor's default scorer (R^2); n_jobs=-1 parallelises over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985453142557521
In [512]:
# GridSearchCV refits the best estimator on the whole training set by default
# (refit=True), so best_estimator_ is already trained — re-fitting it here was
# redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [513]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between two
# *probability distributions* (both inputs are normalized to sum to 1); applying it
# to raw regression targets/predictions is not a standard regression metric — confirm
# intent. It also returns inf if any y_pred element is 0 where y_test is non-zero.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01731215895223598
R2 Score: 0.9985377961513036
RMSE: 0.131576
Entropy Value: 0.000755209259742723
In [514]:
# Rank the model inputs by XGBoost's learned importance scores, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[514]:
feature importance
1 human_development_index 0.604442
5 population 0.227045
0 hospital_beds_per_thousand 0.118292
2 extreme_poverty 0.041327
3 gdp_per_capita 0.008604
4 population_density 0.000290
In [515]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[515]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [516]:
country1 = 'Canada'
country2 = 'Cyprus'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [517]:
df_updated
Out[517]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2099 rows × 9 columns

In [518]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series within each country: 1 day, 1 week (7 days),
# and 1 month (30 days) of history become supervised-learning features.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [519]:
# The earliest rows of each country have no history to lag from;
# treat those missing lag values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [520]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point df_updated.iloc[:, 2:] still contains
# 'Mortality Rate' and its three lag columns, so the components are fit on the
# target itself (likely target leakage -- may explain the near-perfect R^2
# reported below); confirm the intended feature set.
# NOTE(review): PCA is scale-sensitive, but StandardScaler is only applied
# after PCA further down; consider standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[520]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [521]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the matrix passed to transform has 10 columns here (6 features
# + Mortality Rate + 3 lag columns), so these 6 components are mixtures of all
# of them, including the target.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [522]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, yet they are
# labelled with the raw feature names -- the labels do not correspond to the
# original variables, which makes the downstream "feature importance" table
# misleading; consider PC1..PC6 names instead.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [523]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here appear unused afterwards -- X is
# built from principal_df and y from 'Mortality Rate' -- so this step looks
# like dead code in this pipeline; verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [524]:
# Model inputs: the six PCA scores (labelled with raw feature names);
# target: the COVID-19 mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers',
                 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 hold-out split with a fixed seed for reproducibility.
# NOTE(review): a shuffled split on a time series mixes future observations
# into training; a chronological split may be more appropriate here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [525]:
# Learn standardization statistics (mean/std) from the training split only.
scaler = StandardScaler().fit(X_train)
scaler
Out[525]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [526]:
# Apply scaling on the training set (uses the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [527]:
# Apply scaling on the test set with the training-set statistics
# (no refit here, which correctly avoids test-set leakage into the scaler)
X_test_scaled = scaler.transform(X_test)
In [528]:
# Define XGBoost model (default hyperparameters; tuned below via grid search)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 below this means
# 3240 model fits per grid search -- expect a long-running cell.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [529]:
# Perform grid search and 10-fold cross-validation (k = 10)
# cv=10 uses KFold without shuffling (the data was already shuffled by
# train_test_split); scoring defaults to the regressor's R^2; n_jobs=-1
# parallelizes across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989867496016483
In [530]:
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refit on the full training set -- the extra fit() call that was
# here duplicated that (expensive) training with no effect on the result.
best_model = grid_search.best_estimator_

# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
In [531]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to sum to 1
# and returns the KL divergence between them -- it is not a standard regression
# error metric, and zeros in y_test/y_pred can make it infinite; confirm this
# is the intended measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0028020652544185898
R2 Score: 0.9991762074874513
RMSE: 0.052935
Entropy Value: 0.0003290685489915789
In [532]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[532]:
feature importance
0 diabetes_prevalence 0.444854
1 female_smokers 0.370653
5 median_age 0.153361
2 male_smokers 0.025670
3 life_expectancy 0.004770
4 aged_65_older 0.000692
In [533]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path -- consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[533]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [534]:
country1 = 'Canada'
country2 = 'Cyprus'

# Restrict to the country-health-index predictors plus the target, keeping
# only the two countries being compared in this run.
cols_of_interest = ['location', 'date', 'hospital_beds_per_thousand',
                    'human_development_index', 'extreme_poverty',
                    'gdp_per_capita', 'population_density', 'population',
                    'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
In [535]:
df_updated
Out[535]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.093162

2099 rows × 9 columns

In [536]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series within each country: 1 day, 1 week (7 days),
# and 1 month (30 days) of history become supervised-learning features.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [537]:
# The earliest rows of each country have no history to lag from;
# treat those missing lag values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [538]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and its
# three lag columns here, so PCA is fit on the target itself (likely target
# leakage -- may explain the near-perfect R^2 below); confirm the intended
# feature set. PCA is also scale-sensitive, yet StandardScaler is applied only
# after PCA further down.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[538]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [539]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [540]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [541]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [542]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [543]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[543]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [544]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [545]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [546]:
# Define XGBoost model (default hyperparameters; tuned below via grid search)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 below this means
# 3240 model fits per grid search -- expect a long-running cell.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [547]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989605271130995
In [548]:
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refit on the full training set -- the extra fit() call that was
# here duplicated that (expensive) training with no effect on the result.
best_model = grid_search.best_estimator_

# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
In [549]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to sum to 1
# and returns the KL divergence between them -- it is not a standard regression
# error metric, and zeros in y_test/y_pred can make it infinite; confirm this
# is the intended measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002685282233557505
R2 Score: 0.999210541084082
RMSE: 0.051820
Entropy Value: 0.00034527877599041973
In [550]:
# Rank the model inputs by the tuned model's learned importance scores.
# NOTE(review): the 'feature' column holds principal-component labels, not the
# raw variables, so interpret these rankings with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[550]:
feature importance
5 population 0.655426
1 human_development_index 0.198181
0 hospital_beds_per_thousand 0.115957
2 extreme_poverty 0.020517
4 population_density 0.006514
3 gdp_per_capita 0.003405
In [551]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[551]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [552]:
country1 = 'Denmark'
country2 = 'Finland'

# Restrict to the population-health-index predictors plus the target, keeping
# only the two countries being compared in this run.
cols_of_interest = ['location', 'date', 'diabetes_prevalence', 'female_smokers',
                    'male_smokers', 'life_expectancy', 'aged_65_older',
                    'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
In [553]:
df_updated
Out[553]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
5188 Denmark 2/3/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
5189 Denmark 2/4/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
5190 Denmark 2/5/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
5191 Denmark 2/6/2020 6.41 19.3 18.8 80.90 19.677 42.3 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8372 Finland 12/26/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8373 Finland 12/27/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8374 Finland 12/28/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8375 Finland 12/29/2022 5.76 18.3 22.6 81.91 21.228 42.8 0.55159

2128 rows × 9 columns

In [554]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [555]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [556]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and its
# three lag columns here, so PCA is fit on the target itself (likely target
# leakage); confirm the intended feature set. PCA is also scale-sensitive, yet
# StandardScaler is applied only after PCA further down.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[556]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [557]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [558]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [559]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [560]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [561]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[561]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [562]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [563]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [564]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [565]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987252225958365
In [566]:
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refit on the full training set -- the extra fit() call that was
# here duplicated that (expensive) training with no effect on the result.
best_model = grid_search.best_estimator_

# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
In [567]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to sum to 1
# and returns the KL divergence between them -- it is not a standard regression
# error metric, and zeros in y_test/y_pred can make it infinite; confirm this
# is the intended measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010232434979697452
R2 Score: 0.9941953413231738
RMSE: 0.101155
Entropy Value: 0.0021676176101701077
In [568]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[568]:
feature importance
0 diabetes_prevalence 0.491157
1 female_smokers 0.246916
2 male_smokers 0.154249
5 median_age 0.058833
3 life_expectancy 0.048338
4 aged_65_older 0.000507
In [569]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[569]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [570]:
country1 = 'Denmark'
country2 = 'Finland'

# Restrict to the country-health-index predictors plus the target, keeping
# only the two countries being compared in this run.
cols_of_interest = ['location', 'date', 'hospital_beds_per_thousand',
                    'human_development_index', 'extreme_poverty',
                    'gdp_per_capita', 'population_density', 'population',
                    'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols_of_interest]
In [571]:
df_updated
Out[571]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5188 Denmark 2/3/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5189 Denmark 2/4/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5190 Denmark 2/5/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5191 Denmark 2/6/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2128 rows × 9 columns

In [572]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [573]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [574]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and its
# three lag columns here, so PCA is fit on the target itself (likely target
# leakage); confirm the intended feature set. PCA is also scale-sensitive, yet
# StandardScaler is applied only after PCA further down.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[574]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [575]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [576]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [577]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [578]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [579]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[579]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [580]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [581]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [582]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [583]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984953654596765
In [584]:
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refit on the full training set -- the extra fit() call that was
# here duplicated that (expensive) training with no effect on the result.
best_model = grid_search.best_estimator_

# Predict on the held-out test set.
y_pred = best_model.predict(X_test_scaled)
In [585]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to sum to 1
# and returns the KL divergence between them -- it is not a standard regression
# error metric, and zeros in y_test/y_pred can make it infinite; confirm this
# is the intended measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008136863131264145
R2 Score: 0.9953841179278682
RMSE: 0.090205
Entropy Value: 0.0015398143907589235
In [586]:
feature_importances = best_model.feature_importances_
# NOTE(review): if X was built from PCA components upstream (as in the later
# sections of this notebook), these importances describe the components, not
# the original columns, and labelling them with `selected_cols` is
# misleading -- verify against the cell that built X.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[586]:
feature importance
1 human_development_index 0.753936
5 population 0.092143
0 hospital_beds_per_thousand 0.070220
2 extreme_poverty 0.061815
3 gdp_per_capita 0.020882
4 population_density 0.001005
In [587]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path; prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[587]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [588]:
country1 = 'France'
country2 = 'Iceland'

# Keep the population-health features (plus identifiers and the target) and
# restrict the rows to the two countries under comparison; a single .loc does
# both steps without chained indexing.
population_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_cols]
In [589]:
df_updated
Out[589]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
8377 France 1/25/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
8378 France 1/26/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
8379 France 1/27/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
8380 France 1/28/2020 4.77 30.1 35.6 82.66 19.718 42.0 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2107 rows × 9 columns

In [590]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# One lagged mortality column per horizon -- previous day (1), previous week
# (7) and previous month (30) -- computed within each country so values never
# cross a country boundary.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [591]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first rows of each country have no history for the lags to reference;
# treat that missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [592]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the components are partly built from the target
# (leakage into X). PCA is also variance-driven, so the unscaled columns with
# the largest ranges dominate -- standardize before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[592]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [593]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): the transform input still contains 'Mortality Rate' and its
# lags (10 columns total), so the six retained components encode the target.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [594]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables; reusing the original feature names here is misleading (each PC
# is a mixture of every input column).
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# .values drops the filtered index so locations align with the PCs by position.
principal_df['location'] = df_updated['location'].values
In [595]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used afterwards (X is
# taken from principal_df), so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [596]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the six retained principal components (labelled upstream with the
# original feature names); y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled split of daily time-series rows puts near-identical
# adjacent days in both sets, which can inflate the test-set scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [597]:
# Fit scaling on the training set
# Mean/std come from the training split only, so no test-set information
# leaks into the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[597]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [598]:
# Apply scaling on the training set
# Standardize using the train-fitted mean and standard deviation.
X_train_scaled = scaler.transform(X_train)
In [599]:
# Apply scaling on the test set
# Reuses the train-fitted statistics; the scaler is never refit on test data.
X_test_scaled = scaler.transform(X_test)
In [600]:
# Base XGBoost regressor; every tuned setting is supplied by the grid below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search (3*3*3*3*2*2 = 324
# combinations). Key order is kept as originally defined so the parameter
# grid is enumerated identically.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [601]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 model fits; n_jobs=-1
# parallelizes across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9946592139246286
In [602]:
# Fit the model using the best hyperparameters
# FIX: GridSearchCV refits the best estimator on the full training set by
# default (refit=True), so best_estimator_ arrives already trained; the
# previous explicit second fit on the same data was redundant work and has
# been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [603]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to sum to
# one and returns the KL divergence between them -- it is not a standard
# regression metric, and it is undefined (inf) wherever y_test > 0 but
# y_pred <= 0. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.053596386319007226
R2 Score: 0.9957380613117373
RMSE: 0.231509
Entropy Value: 0.0014569645741554262
In [604]:
# Rank the tuned model's inputs by importance.
# FIX: X was built from principal components (X = principal_df[selected_cols]
# after PCA), so the importances belong to the components, not the original
# variables. Labelling them with the original feature names was misleading;
# they are labelled PC1..PCn here instead.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': ['PC%d' % (i + 1) for i in range(len(feature_importances))],
     'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[604]:
feature importance
5 median_age 0.568274
0 diabetes_prevalence 0.232028
1 female_smokers 0.185377
2 male_smokers 0.007281
3 life_expectancy 0.004830
4 aged_65_older 0.002209
In [605]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path; prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[605]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [606]:
country1 = 'France'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the country health index
# Column subset first, then keep only the two countries under comparison.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [607]:
df_updated
Out[607]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2107 rows × 9 columns

In [608]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# One lagged mortality column per horizon -- previous day (1), previous week
# (7) and previous month (30) -- computed within each country so values never
# cross a country boundary.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [609]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first rows of each country have no history for the lags to reference;
# treat that missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [610]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the components are partly built from the target
# (leakage into X). PCA is also variance-driven, so the unscaled 'population'
# column dominates the variance -- standardize before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[610]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [611]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): the transform input still contains 'Mortality Rate' and its
# lags (10 columns total), so the six retained components encode the target.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [612]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables; reusing the original feature names here is misleading (each PC
# is a mixture of every input column).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# .values drops the filtered index so locations align with the PCs by position.
principal_df['location'] = df_updated['location'].values
In [613]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used afterwards (X is
# taken from principal_df), so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [614]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the six retained principal components (labelled upstream with the
# original feature names); y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled split of daily time-series rows puts near-identical
# adjacent days in both sets, which can inflate the test-set scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [615]:
# Fit scaling on the training set
# Mean/std come from the training split only, so no test-set information
# leaks into the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[615]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [616]:
# Apply scaling on the training set
# Standardize using the train-fitted mean and standard deviation.
X_train_scaled = scaler.transform(X_train)
In [617]:
# Apply scaling on the test set
# Reuses the train-fitted statistics; the scaler is never refit on test data.
X_test_scaled = scaler.transform(X_test)
In [618]:
# Base XGBoost regressor; every tuned setting is supplied by the grid below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search (3*3*3*3*2*2 = 324
# combinations). Key order is kept as originally defined so the parameter
# grid is enumerated identically.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [619]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 model fits; n_jobs=-1
# parallelizes across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9946715554617531
In [620]:
# Fit the model using the best hyperparameters
# FIX: GridSearchCV refits the best estimator on the full training set by
# default (refit=True), so best_estimator_ arrives already trained; the
# previous explicit second fit on the same data was redundant work and has
# been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [621]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to sum to
# one and returns the KL divergence between them -- it is not a standard
# regression metric, and it is undefined (inf) wherever y_test > 0 but
# y_pred <= 0. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06349754860870421
R2 Score: 0.9949507293753999
RMSE: 0.251987
Entropy Value: 0.002273841175109527
In [622]:
# Rank the tuned model's inputs by importance.
# FIX: X was built from principal components (X = principal_df[selected_cols]
# after PCA), so the importances belong to the components, not the original
# variables. Labelling them with the original feature names was misleading;
# they are labelled PC1..PCn here instead.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': ['PC%d' % (i + 1) for i in range(len(feature_importances))],
     'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[622]:
feature importance
1 human_development_index 0.739557
0 hospital_beds_per_thousand 0.133697
4 population_density 0.060550
5 population 0.031155
3 gdp_per_capita 0.017913
2 extreme_poverty 0.017128
In [623]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path; prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[623]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [624]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the population health index
# Column subset first, then keep only the two countries under comparison.
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [625]:
df_updated
Out[625]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2099 rows × 9 columns

In [626]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# One lagged mortality column per horizon -- previous day (1), previous week
# (7) and previous month (30) -- computed within each country so values never
# cross a country boundary.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [627]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first rows of each country have no history for the lags to reference;
# treat that missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [628]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the components are partly built from the target
# (leakage into X). PCA is also variance-driven, so the unscaled columns with
# the largest ranges dominate -- standardize before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[628]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [629]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): the transform input still contains 'Mortality Rate' and its
# lags (10 columns total), so the six retained components encode the target.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [630]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables; reusing the original feature names here is misleading (each PC
# is a mixture of every input column).
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# .values drops the filtered index so locations align with the PCs by position.
principal_df['location'] = df_updated['location'].values
In [631]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns are never used afterwards (X is
# taken from principal_df), so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [632]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the six retained principal components (labelled upstream with the
# original feature names); y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled split of daily time-series rows puts near-identical
# adjacent days in both sets, which can inflate the test-set scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [633]:
# Fit scaling on the training set
# Mean/std come from the training split only, so no test-set information
# leaks into the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[633]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [634]:
# Apply scaling on the training set
# Standardize using the train-fitted mean and standard deviation.
X_train_scaled = scaler.transform(X_train)
In [635]:
# Apply scaling on the test set
# Reuses the train-fitted statistics; the scaler is never refit on test data.
X_test_scaled = scaler.transform(X_test)
In [636]:
# Base XGBoost regressor; every tuned setting is supplied by the grid below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search (3*3*3*3*2*2 = 324
# combinations). Key order is kept as originally defined so the parameter
# grid is enumerated identically.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [637]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 model fits; n_jobs=-1
# parallelizes across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989070959122885
In [638]:
# Fit the model using the best hyperparameters
# FIX: GridSearchCV refits the best estimator on the full training set by
# default (refit=True), so best_estimator_ arrives already trained; the
# previous explicit second fit on the same data was redundant work and has
# been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [639]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to sum to
# one and returns the KL divergence between them -- it is not a standard
# regression metric, and it is undefined (inf) wherever y_test > 0 but
# y_pred <= 0. Confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005371809550427108
R2 Score: 0.9995586936801555
RMSE: 0.073293
Entropy Value: 0.00023323823646080252
In [640]:
# Rank the tuned model's inputs by importance.
# FIX: X was built from principal components (X = principal_df[selected_cols]
# after PCA), so the importances belong to the components, not the original
# variables. Labelling them with the original feature names was misleading;
# they are labelled PC1..PCn here instead.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': ['PC%d' % (i + 1) for i in range(len(feature_importances))],
     'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[640]:
feature importance
5 median_age 0.667548
0 diabetes_prevalence 0.180913
1 female_smokers 0.142373
2 male_smokers 0.008080
3 life_expectancy 0.000833
4 aged_65_older 0.000253
In [641]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path; prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[641]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [642]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [643]:
df_updated
Out[643]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2099 rows × 9 columns

In [644]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# One lagged mortality column per horizon -- previous day (1), previous week
# (7) and previous month (30) -- computed within each country so values never
# cross a country boundary.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [645]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first rows of each country have no history for the lags to reference;
# treat that missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [646]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the components are partly built from the target
# (leakage into X). PCA is also variance-driven, so the unscaled 'population'
# column dominates the variance -- standardize before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[646]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [647]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): the transform input still contains 'Mortality Rate' and its
# lags (10 columns total), so the six retained components encode the target.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [648]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first 6 principal-component SCORES, not the
# original features — reusing the raw feature names is misleading, and the
# "feature importances" computed later are importances of PCs, not of these names.
# (Renaming would break the downstream selected_cols lookups, so noted only.)
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [649]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model below takes X from principal_df, so these dummy columns
# are never used as features — this step effectively just removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [650]:
# Model inputs: the 6 PC-score columns; target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [651]:
# Fit scaling on the training set
# Standardize the 6 PC scores to zero mean / unit variance; statistics are
# estimated from the training split only, so no test-set information leaks in here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[651]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [652]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [653]:
# Apply scaling on the test set (reuses the training-set mean/variance)
X_test_scaled = scaler.transform(X_test)
In [654]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search below
# (324 combinations: 3 x 3 x 3 x 3 x 2 x 2).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [655]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 model fits; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988984971748192
In [656]:
# Use the best hyperparameter combination found by the grid search.
# GridSearchCV(refit=True, the default) has already refit best_estimator_ on the
# full training set, so the explicit best_model.fit(...) that was here was
# redundant work and has been removed; predictions are unchanged.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [657]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into probability
# distributions and returns the KL divergence D(y_test || y_pred) — it is not a
# standard regression error metric; interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009889496543689754
R2 Score: 0.9991875554626719
RMSE: 0.099446
Entropy Value: 0.00044270709478587373
In [658]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[658]:
feature importance
1 human_development_index 0.702587
0 hospital_beds_per_thousand 0.200820
5 population 0.042593
2 extreme_poverty 0.026734
3 gdp_per_capita 0.018469
4 population_density 0.008797
In [659]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR
# (e.g. pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[659]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [660]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict to the population-health feature set, keeping only the two countries
# under comparison for this XGBoost Model Analysis.
keep_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [661]:
df_updated
Out[661]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 4.42 20.9 26.0 82.25 14.312 39.7 0.377872

2078 rows × 9 columns

In [662]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged copies of the target (1-day, 7-day, 30-day history);
# grouping by location keeps a lag from crossing a country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [663]:
# The first day/week/month of each country's series has no history, so the
# lag columns start as NaN; treat missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [664]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows (columns 2+), i.e. before the train/test
# split below — test rows influence the components. Features are also unscaled
# at this point, so large-magnitude columns dominate — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[664]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [665]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of principal components retained (matches the 6 input variables)
# Project every row onto the fitted components, keeping only the first 6 scores.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [666]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first 6 principal-component SCORES, not the
# original features — reusing the raw feature names is misleading, and downstream
# "feature importances" are importances of PCs, not of these named columns.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [667]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model below takes X from principal_df, so these dummy columns
# are never used as features — this step effectively just removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [668]:
# Model inputs: the 6 PC-score columns; target: the raw mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [669]:
# Fit scaling on the training set
# Standardize the 6 PC scores to zero mean / unit variance; statistics are
# estimated from the training split only, so no test-set information leaks in here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[669]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [670]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [671]:
# Apply scaling on the test set (reuses the training-set mean/variance)
X_test_scaled = scaler.transform(X_test)
In [672]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search below
# (324 combinations: 3 x 3 x 3 x 3 x 2 x 2).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [673]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 model fits; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989525073226527
In [674]:
# Use the best hyperparameter combination found by the grid search.
# GridSearchCV(refit=True, the default) has already refit best_estimator_ on the
# full training set, so the explicit best_model.fit(...) that was here was
# redundant work and has been removed; predictions are unchanged.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [675]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into probability
# distributions and returns the KL divergence D(y_test || y_pred) — it is not a
# standard regression error metric; interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004874464208163757
R2 Score: 0.9993646974648677
RMSE: 0.069817
Entropy Value: 0.0008434937353807828
In [676]:
# Rank the model inputs (PCA component scores, labelled with the original
# feature names) by the fitted booster's importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[676]:
feature importance
5 median_age 0.811714
0 diabetes_prevalence 0.116426
2 male_smokers 0.038479
1 female_smokers 0.030675
3 life_expectancy 0.002520
4 aged_65_older 0.000185
In [677]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR
# (e.g. pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[677]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [678]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict to the country-health feature set, keeping only the two countries
# under comparison for this XGBoost Model Analysis.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [679]:
df_updated
Out[679]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872

2078 rows × 9 columns

In [680]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged copies of the target (1-day, 7-day, 30-day history);
# grouping by location keeps a lag from crossing a country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [681]:
# The first day/week/month of each country's series has no history, so the
# lag columns start as NaN; treat missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [682]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows (columns 2+), i.e. before the train/test
# split below — test rows influence the components. Features are also unscaled
# at this point, so large-magnitude columns dominate — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[682]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [683]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of principal components retained (matches the 6 input variables)
# Project every row onto the fitted components, keeping only the first 6 scores.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [684]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first 6 principal-component SCORES, not the
# original features — reusing the raw feature names is misleading, and downstream
# "feature importances" are importances of PCs, not of these named columns.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [685]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model below takes X from principal_df, so these dummy columns
# are never used as features — this step effectively just removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [686]:
# Model inputs: the 6 PC-score columns; target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [687]:
# Fit scaling on the training set
# Standardize the 6 PC scores to zero mean / unit variance; statistics are
# estimated from the training split only, so no test-set information leaks in here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[687]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [688]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [689]:
# Apply scaling on the test set (reuses the training-set mean/variance)
X_test_scaled = scaler.transform(X_test)
In [690]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search below
# (324 combinations: 3 x 3 x 3 x 3 x 2 x 2).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [691]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 model fits; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987243093178199
In [692]:
# Use the best hyperparameter combination found by the grid search.
# GridSearchCV(refit=True, the default) has already refit best_estimator_ on the
# full training set, so the explicit best_model.fit(...) that was here was
# redundant work and has been removed; predictions are unchanged.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [693]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into probability
# distributions and returns the KL divergence D(y_test || y_pred) — it is not a
# standard regression error metric; interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007716113263742701
R2 Score: 0.9989943374064345
RMSE: 0.087841
Entropy Value: 0.0017161933932385992
In [694]:
# Rank the model inputs (PCA component scores, labelled with the original
# feature names) by the fitted booster's importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[694]:
feature importance
1 human_development_index 0.482469
2 extreme_poverty 0.244199
0 hospital_beds_per_thousand 0.199908
5 population 0.057787
3 gdp_per_capita 0.014620
4 population_density 0.001017
In [695]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR
# (e.g. pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[695]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [696]:
country1 = 'Portugal'
country2 = 'Spain'

# Restrict to the population-health feature set, keeping only the two countries
# under comparison for this XGBoost Model Analysis.
keep_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [697]:
df_updated
Out[697]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2097 rows × 9 columns

In [698]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Per-country lagged copies of the target (1-day, 7-day, 30-day history);
# grouping by location keeps a lag from crossing a country boundary.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [699]:
# The first day/week/month of each country's series has no history, so the
# lag columns start as NaN; treat missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [700]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows (columns 2+), i.e. before the train/test
# split below — test rows influence the components. Features are also unscaled
# at this point, so large-magnitude columns dominate — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[700]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [701]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of principal components retained (matches the 6 input variables)
# Project every row onto the fitted components, keeping only the first 6 scores.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [702]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold the first 6 principal-component SCORES, not the
# original features — reusing the raw feature names is misleading, and downstream
# "feature importances" are importances of PCs, not of these named columns.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [703]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model below takes X from principal_df, so these dummy columns
# are never used as features — this step effectively just removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [704]:
# Model inputs: the 6 PC-score columns; target: the raw mortality rate.
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [705]:
# Fit scaling on the training set
# Standardize the 6 PC scores to zero mean / unit variance; statistics are
# estimated from the training split only, so no test-set information leaks in here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[705]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [706]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [707]:
# Apply scaling on the test set (reuses the training-set mean/variance)
X_test_scaled = scaler.transform(X_test)
In [708]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search below
# (324 combinations: 3 x 3 x 3 x 3 x 2 x 2).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [709]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 model fits; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987444044994997
In [710]:
# Use the best hyperparameter combination found by the grid search.
# GridSearchCV(refit=True, the default) has already refit best_estimator_ on the
# full training set, so the explicit best_model.fit(...) that was here was
# redundant work and has been removed; predictions are unchanged.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [711]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into probability
# distributions and returns the KL divergence D(y_test || y_pred) — it is not a
# standard regression error metric; interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006938078296368985
R2 Score: 0.9987302355995943
RMSE: 0.083295
Entropy Value: 0.0004794443988216877
In [712]:
# Rank the model inputs (PCA component scores, labelled with the original
# feature names) by the fitted booster's importance scores, largest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[712]:
feature importance
0 diabetes_prevalence 0.444681
1 female_smokers 0.434714
5 median_age 0.093224
2 male_smokers 0.015735
3 life_expectancy 0.011364
4 aged_65_older 0.000283
In [713]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR
# (e.g. pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[713]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [714]:
country1 = 'Portugal'
country2 = 'Spain'

# Restrict to the country-health feature set, keeping only the two countries
# under comparison for this XGBoost Model Analysis.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [715]:
df_updated
Out[715]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148

2097 rows × 9 columns

In [716]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the previous-day / previous-week / previous-month mortality features
# per country in one loop instead of three copy-pasted statements.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [717]:
# The first day/week/month of each country's series has no lagged history;
# treat that missing history as zero mortality in one vectorized fillna.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [718]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which includes 'Mortality Rate' (the prediction target) and the three lagged
# mortality columns. Fitting PCA with the target present leaks it into the
# components that X is later built from, which likely explains the
# near-perfect R^2 downstream — confirm and restrict to predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[718]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [719]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index.
# NOTE(review): the PCA input actually has more than 6 columns here (six
# predictors plus 'Mortality Rate' and three lagged-mortality columns), so this
# keeps the first 6 components of a wider decomposition rather than one
# component per input variable — verify the intended column set.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [720]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are principal components (linear combinations of
# ALL PCA input columns), not the original variables; labelling them with raw
# feature names makes the later "feature importance" table read as a ranking of
# the original features — consider PC1..PC6 names instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [721]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the location dummy columns do not appear to be used afterwards
# (X is built from principal_df) — confirm whether this cell is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [722]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: first six principal components. Rows are positionally aligned with
# df_updated because principal_df was built from df_updated's rows in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model.
# NOTE(review): train_test_split shuffles by default; for time-series data a
# chronological split (shuffle=False or a date cutoff) avoids mixing future
# and past observations — confirm the random split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [723]:
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaler (standard practice).
scaler = StandardScaler()
scaler.fit(X_train)
Out[723]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [724]:
# Apply the training-set scaling to the training data
X_train_scaled = scaler.transform(X_train)
In [725]:
# Apply the same training-set scaling to the test data (no refit)
X_test_scaled = scaler.transform(X_test)
In [726]:
# Define XGBoost model (regression objective; hyperparameters tuned below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [727]:
# Perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the mean cross-validated score of the best candidate
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982848456743826
In [728]:
# GridSearchCV (refit=True by default) has already refit the best estimator on
# the full training set, so calling .fit() again here repeats that work with
# no change in result — use the refitted best estimator directly.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [729]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs to sum to 1
# and returns the KL divergence between them — it is not a regression error
# metric, and non-positive entries (zeros in y_test, possible negative
# predictions) make terms undefined. Confirm this is intentional or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011554150205913471
R2 Score: 0.9978854305210006
RMSE: 0.107490
Entropy Value: 0.0006584258574244559
In [730]:
# Rank inputs by XGBoost's learned importance scores.
# NOTE(review): these rows are principal components relabelled with the
# original feature names (see the principal_df cell), so the table does not
# directly measure the importance of the raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[730]:
feature importance
1 human_development_index 0.704819
5 population 0.214352
0 hospital_beds_per_thousand 0.044421
2 extreme_poverty 0.030265
3 gdp_per_capita 0.005652
4 population_density 0.000491
In [731]:
# Importing the dataframe of all 26 countries.
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[731]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [732]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() materialises the filtered slice so the later lagged-mortality column
# assignments write to an independent frame instead of a view
# (avoids pandas' SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [733]:
df_updated
Out[733]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2102 rows × 9 columns

In [734]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the previous-day / previous-week / previous-month mortality features
# per country in one loop instead of three copy-pasted statements.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [735]:
# The first day/week/month of each country's series has no lagged history;
# treat that missing history as zero mortality in one vectorized fillna.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [736]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the
# three lagged-mortality columns. Fitting PCA with the target present leaks it
# into the components X is later built from — likely the cause of the
# near-perfect R^2 downstream; restrict to predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[736]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [737]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index.
# NOTE(review): the PCA input actually has more than 6 columns (six predictors
# plus 'Mortality Rate' and three lagged-mortality columns), so this keeps the
# first 6 components of a wider decomposition — verify the intended column set.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [738]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are principal components, not the original
# variables; labelling them with raw feature names makes the later
# "feature importance" table misleading — consider PC1..PC6 names.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [739]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the location dummy columns do not appear to be used afterwards
# (X is built from principal_df) — confirm whether this cell is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [740]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: first six principal components, positionally aligned with df_updated rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model.
# NOTE(review): shuffled split on time-series data; a chronological split
# (shuffle=False) avoids mixing future and past — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [741]:
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaler (standard practice).
scaler = StandardScaler()
scaler.fit(X_train)
Out[741]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [742]:
# Apply the training-set scaling to the training data
X_train_scaled = scaler.transform(X_train)
In [743]:
# Apply the same training-set scaling to the test data (no refit)
X_test_scaled = scaler.transform(X_test)
In [744]:
# Define XGBoost model (regression objective; hyperparameters tuned below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [745]:
# Perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the mean cross-validated score of the best candidate
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984333716901233
In [746]:
# GridSearchCV (refit=True by default) has already refit the best estimator on
# the full training set, so calling .fit() again here repeats that work with
# no change in result — use the refitted best estimator directly.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [747]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs and returns
# a KL divergence — not a regression error metric; non-positive entries make
# terms undefined. Confirm this is intentional or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.015931239549295276
R2 Score: 0.9969563630033106
RMSE: 0.126219
Entropy Value: 0.0006754021628688348
In [748]:
# Rank inputs by XGBoost's learned importance scores.
# NOTE(review): these rows are principal components relabelled with original
# feature names, so the table does not directly rank the raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[748]:
feature importance
1 female_smokers 0.775295
0 diabetes_prevalence 0.148899
3 life_expectancy 0.030343
2 male_smokers 0.025867
5 median_age 0.019459
4 aged_65_older 0.000137
In [749]:
# Importing the dataframe of all 26 countries.
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[749]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [750]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() materialises the filtered slice so the later lagged-mortality column
# assignments write to an independent frame instead of a view
# (avoids pandas' SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [751]:
df_updated
Out[751]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.816005

2102 rows × 9 columns

In [752]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the previous-day / previous-week / previous-month mortality features
# per country in one loop instead of three copy-pasted statements.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [753]:
# The first day/week/month of each country's series has no lagged history;
# treat that missing history as zero mortality in one vectorized fillna.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [754]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the
# three lagged-mortality columns. Fitting PCA with the target present leaks it
# into the components X is later built from — likely the cause of the
# near-perfect R^2 downstream; restrict to predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[754]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [755]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index.
# NOTE(review): the PCA input actually has more than 6 columns (six predictors
# plus 'Mortality Rate' and three lagged-mortality columns), so this keeps the
# first 6 components of a wider decomposition — verify the intended column set.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [756]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are principal components, not the original
# variables; labelling them with raw feature names makes the later
# "feature importance" table misleading — consider PC1..PC6 names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [757]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the location dummy columns do not appear to be used afterwards
# (X is built from principal_df) — confirm whether this cell is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [758]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: first six principal components, positionally aligned with df_updated rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model.
# NOTE(review): shuffled split on time-series data; a chronological split
# (shuffle=False) avoids mixing future and past — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [759]:
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaler (standard practice).
scaler = StandardScaler()
scaler.fit(X_train)
Out[759]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [760]:
# Apply the training-set scaling to the training data
X_train_scaled = scaler.transform(X_train)
In [761]:
# Apply the same training-set scaling to the test data (no refit)
X_test_scaled = scaler.transform(X_test)
In [762]:
# Define XGBoost model (regression objective; hyperparameters tuned below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [763]:
# Perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the mean cross-validated score of the best candidate
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985258910599774
In [764]:
# GridSearchCV (refit=True by default) has already refit the best estimator on
# the full training set, so calling .fit() again here repeats that work with
# no change in result — use the refitted best estimator directly.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [765]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both inputs and returns
# a KL divergence — not a regression error metric; non-positive entries make
# terms undefined. Confirm this is intentional or drop it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009780822718122402
R2 Score: 0.9981313899781104
RMSE: 0.098898
Entropy Value: 0.0007038834783452404
In [766]:
# Rank inputs by XGBoost's learned importance scores.
# NOTE(review): these rows are principal components relabelled with original
# feature names, so the table does not directly rank the raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[766]:
feature importance
1 human_development_index 0.543966
5 population 0.270168
0 hospital_beds_per_thousand 0.096949
3 gdp_per_capita 0.051447
2 extreme_poverty 0.037239
4 population_density 0.000231
In [767]:
# Importing the dataframe of all 26 countries.
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[767]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [13]:
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() materialises the filtered slice so the later lagged-mortality column
# assignments write to an independent frame instead of a view
# (avoids pandas' SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [14]:
df_updated
Out[14]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 9 columns

In [15]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the previous-day / previous-week / previous-month mortality features
# per country in one loop instead of three copy-pasted statements.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [16]:
# The first day/week/month of each country's series has no lagged history;
# treat that missing history as zero mortality in one vectorized fillna.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [17]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity.
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the
# three lagged-mortality columns. Fitting PCA with the target present leaks it
# into the components X is later built from — inflating downstream scores;
# restrict to predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[17]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [18]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index.
# NOTE(review): the PCA input actually has more than 6 columns (six predictors
# plus 'Mortality Rate' and three lagged-mortality columns), so this keeps the
# first 6 components of a wider decomposition — verify the intended column set.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [19]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe.
# NOTE(review): these columns are principal components, not the original
# variables; labelling them with raw feature names makes the later
# "feature importance" table misleading — consider PC1..PC6 names.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [20]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method.
# NOTE(review): the location dummy columns do not appear to be used afterwards
# (X is built from principal_df) — confirm whether this cell is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [21]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: first six principal components, positionally aligned with df_updated rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model.
# NOTE(review): shuffled split on time-series data; a chronological split
# (shuffle=False) avoids mixing future and past — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [22]:
# Fit scaling on the training set only, so test-set statistics do not leak
# into the scaler (standard practice).
scaler = StandardScaler()
scaler.fit(X_train)
Out[22]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [23]:
# Apply the training-set scaling to the training data
X_train_scaled = scaler.transform(X_train)
In [24]:
# Apply the same training-set scaling to the test data (no refit)
X_test_scaled = scaler.transform(X_test)
In [25]:
# Define XGBoost model (regression objective; hyperparameters tuned below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [26]:
# Perform grid search and 10-fold cross-validation (k = 10).
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the best hyperparameters and the mean cross-validated score of the best candidate
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9572922428501809
In [27]:
# GridSearchCV (refit=True by default) has already refit the best estimator on
# the full training set, so calling .fit() again here repeats that work with
# no change in result — use the refitted best estimator directly.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [28]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both arrays and computes the KL
# divergence; it returns inf whenever a prediction is 0 where the true value
# is positive. A tiny epsilon keeps the value finite.
# NOTE(review): KL divergence on raw mortality rates (not probability
# distributions) is hard to interpret — consider dropping this metric.
eps = 1e-12
entropy_val = entropy(y_test + eps, y_pred + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  1.3971853498164353
R2 Score: 0.9423880476703369
RMSE: 1.182026
Entropy Value: 0.006984034800282115
In [29]:
# Feature-importance table for the fitted model, sorted descending.
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCs that were only *labelled* with the original
# feature names — they do not measure the raw variables directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[29]:
feature importance
0 diabetes_prevalence 0.400049
5 median_age 0.369858
2 male_smokers 0.116165
4 aged_65_older 0.060910
1 female_smokers 0.044668
3 life_expectancy 0.008349
In [30]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[30]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [31]:
# Countries compared in this section
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() gives an independent frame: later cells assign new lag columns to
# this result, which on a view of the original DataFrame would raise
# SettingWithCopyWarning and can silently fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [32]:
# Quick sanity check of the filtered two-country frame
df_updated
Out[32]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 9 columns

In [33]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so a lag never
# crosses the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [34]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 fabricates "zero mortality"
# for each country's first 1/7/30 rows — consider dropping those rows instead.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [35]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA
# input still contains 'Mortality Rate' and the three lag columns — the
# prediction target leaks into the components used as X below, which likely
# inflates the model scores. PCA is also fit on unscaled data, so
# large-magnitude columns (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[35]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [36]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [37]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names relabel PCs 1-6 with the original feature
# names; the values are principal components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [38]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never consumed below (X comes from
# principal_df), so this step appears to be dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [39]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds principal components (labelled with original feature names); y is
# the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a daily time series interleaves train and
# test days — a time-based split would be a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [40]:
# Fit scaling on the training set
# Fit on the training split only, then applied to both splits — no test-set
# statistics leak into the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
Out[40]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [41]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [42]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [43]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations, each fit 10 times by the CV below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [44]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; default refit=True refits the winner.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9566213518011253
In [45]:
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the full training set — calling fit() again would only repeat
# identical work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [46]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both arrays and computes the KL
# divergence; it returns inf whenever a prediction is 0 where the true value
# is positive. A tiny epsilon keeps the value finite.
# NOTE(review): KL divergence on raw mortality rates (not probability
# distributions) is hard to interpret — consider dropping this metric.
eps = 1e-12
entropy_val = entropy(y_test + eps, y_pred + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.713063858863409
R2 Score: 0.9705973147727024
RMSE: 0.844431
Entropy Value: 0.008268179051644176
In [47]:
# Feature-importance table for the fitted model, sorted descending.
# NOTE(review): importances attach to principal components that were only
# *labelled* with these feature names, not to the raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[47]:
feature importance
0 hospital_beds_per_thousand 0.302760
1 human_development_index 0.227340
5 population 0.207927
4 population_density 0.146404
2 extreme_poverty 0.077408
3 gdp_per_capita 0.038161
In [803]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[803]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [804]:
# Countries compared in this section
country1 = 'Czechia'
country2 = 'Estonia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() gives an independent frame: later cells assign new lag columns to
# this result, which on a view of the original DataFrame would raise
# SettingWithCopyWarning and can silently fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [805]:
# Quick sanity check of the filtered two-country frame
df_updated
Out[805]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4154 Czechia 3/2/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4155 Czechia 3/3/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4156 Czechia 3/4/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4157 Czechia 3/5/2020 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.464100
7306 Estonia 12/26/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.464100
7307 Estonia 12/27/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.463645
7308 Estonia 12/28/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.466423
7309 Estonia 12/29/2022 4.02 24.5 39.3 78.74 19.452 42.7 0.466423

2095 rows × 9 columns

In [806]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so a lag never
# crosses the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [807]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 fabricates "zero mortality"
# for each country's first 1/7/30 rows — consider dropping those rows instead.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [808]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA
# input still contains 'Mortality Rate' and the three lag columns — the
# prediction target leaks into the components used as X below, which likely
# explains the near-perfect CV score. PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[808]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [809]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [810]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names relabel PCs 1-6 with the original feature
# names; the values are principal components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [811]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never consumed below (X comes from
# principal_df), so this step appears to be dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [812]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series — train and test days
# interleave; a time-based split would be a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [813]:
# Fit scaling on the training set
# Fit on the training split only, then applied to both splits — no test-set
# statistics leak into the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
Out[813]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [814]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [815]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [816]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations, each fit 10 times by the CV below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [817]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; default refit=True refits the winner.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987100560424087
In [818]:
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the full training set — calling fit() again would only repeat
# identical work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [819]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both arrays and computes the KL
# divergence; it returns inf whenever a prediction is 0 where the true value
# is positive. A tiny epsilon keeps the value finite.
# NOTE(review): KL divergence on raw mortality rates (not probability
# distributions) is hard to interpret — consider dropping this metric.
eps = 1e-12
entropy_val = entropy(y_test + eps, y_pred + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0010058615170550663
R2 Score: 0.9985394731187587
RMSE: 0.031715
Entropy Value: 0.0002910800367932633
In [820]:
# Feature-importance table for the fitted model, sorted descending.
# NOTE(review): importances attach to principal components that were only
# *labelled* with these feature names, not to the raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[820]:
feature importance
1 female_smokers 0.747054
0 diabetes_prevalence 0.135447
5 median_age 0.081459
2 male_smokers 0.029858
3 life_expectancy 0.005972
4 aged_65_older 0.000209
In [821]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[821]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [822]:
# Countries compared in this section
country1 = 'Czechia'
country2 = 'Estonia'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() gives an independent frame: later cells assign new lag columns to
# this result, which on a view of the original DataFrame would raise
# SettingWithCopyWarning and can silently fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [823]:
# Quick sanity check of the filtered two-country frame
df_updated
Out[823]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423

2095 rows × 9 columns

In [824]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so a lag never
# crosses the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [825]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 fabricates "zero mortality"
# for each country's first 1/7/30 rows — consider dropping those rows instead.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [826]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA
# input still contains 'Mortality Rate' and the three lag columns — the
# prediction target leaks into the components used as X below, which likely
# explains the near-perfect CV score. PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[826]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [827]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [828]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names relabel PCs 1-6 with the original feature
# names; the values are principal components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [829]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never consumed below (X comes from
# principal_df), so this step appears to be dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [830]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series — train and test days
# interleave; a time-based split would be a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [831]:
# Fit scaling on the training set
# Fit on the training split only, then applied to both splits — no test-set
# statistics leak into the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
Out[831]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [832]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [833]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [834]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations, each fit 10 times by the CV below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [835]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; default refit=True refits the winner.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982992559865769
In [836]:
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the full training set — calling fit() again would only repeat
# identical work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [837]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes both arrays and computes the KL
# divergence; it returns inf whenever a prediction is 0 where the true value
# is positive — exactly the "Entropy Value: inf" seen in this cell's output.
# A tiny epsilon keeps the value finite.
# NOTE(review): KL divergence on raw mortality rates (not probability
# distributions) is hard to interpret — consider dropping this metric.
eps = 1e-12
entropy_val = entropy(y_test + eps, y_pred + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0013297376906791
R2 Score: 0.9980691997762051
RMSE: 0.036466
Entropy Value: inf
In [838]:
# Feature-importance table for the fitted model, sorted descending.
# NOTE(review): importances attach to principal components that were only
# *labelled* with these feature names, not to the raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[838]:
feature importance
1 human_development_index 0.691546
0 hospital_beds_per_thousand 0.132334
5 population 0.127958
2 extreme_poverty 0.030192
3 gdp_per_capita 0.017370
4 population_density 0.000600
In [839]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[839]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [840]:
# Countries compared in this section
country1 = 'Slovakia'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() gives an independent frame: later cells assign new lag columns to
# this result, which on a view of the original DataFrame would raise
# SettingWithCopyWarning and can silently fail to write.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [841]:
# Quick sanity check of the filtered two-country frame
df_updated
Out[841]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2091 rows × 9 columns

In [842]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so a lag never
# crosses the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [843]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the leading NaNs with 0 fabricates "zero mortality"
# for each country's first 1/7/30 rows — consider dropping those rows instead.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [844]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point iloc[:, 2:] includes 'Mortality Rate' (the
# prediction target) and its lag columns, and PCA is fit on ALL rows before
# the train/test split — both are data leakage; confirm this is intentional
# before trusting downstream scores.
# NOTE(review): features are not standardized before PCA, so components are
# dominated by the largest-scale columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[844]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [845]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first 6 components (highest variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [846]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the raw
# features they are named after; the labels are misleading, and downstream
# "feature importances" actually describe the PCs, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [847]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never read by the
# later cells (only 'Mortality Rate' is used from df_updated afterwards), so
# this step appears redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [848]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): X rows come from principal_df and y from df_updated; they align
# only because both frames were built from the same rows in the same order.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% as a test split. NOTE(review): this shuffles a time series
# randomly, so train and test rows are interleaved in time — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [849]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only (test stays unseen).
scaler = StandardScaler()
scaler.fit(X_train)
Out[849]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [850]:
# Apply scaling on the training set, using the statistics learned above
X_train_scaled = scaler.transform(X_train)
In [851]:
# Apply scaling on the test set (train-set statistics; no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [852]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, learning rate, ensemble size,
# minimum split loss (gamma), and row/column subsampling fractions.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [853]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): folds are shuffled rows of a time series and scoring uses the
# regressor's default R^2; consider TimeSeriesSplit for honest estimates.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9984547895987792
In [854]:
# GridSearchCV refits the best parameter combination on the whole training
# split by default (refit=True), so best_estimator_ is already trained;
# calling fit() again only repeats identical work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [855]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy rescales both arguments into probability
# distributions and returns their KL divergence — it is not a regression
# error metric, and zero values in y_test make it ill-defined; confirm this
# number is meaningful before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002902405614533229
R2 Score: 0.9985760200848308
RMSE: 0.053874
Entropy Value: 0.0005018366914602828
In [856]:
# Rank model inputs by XGBoost's importance scores.
# NOTE(review): the "features" here are mislabeled principal components, so
# these importances do not refer to the raw named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[856]:
feature importance
1 female_smokers 0.778330
0 diabetes_prevalence 0.172858
5 median_age 0.023924
2 male_smokers 0.016332
3 life_expectancy 0.004305
4 aged_65_older 0.004251
In [857]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[857]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [858]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Restrict to the country-health input columns plus identifiers/target,
# keeping only the two countries being compared.
model_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), model_cols]
In [859]:
df_updated
Out[859]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2091 rows × 9 columns

In [860]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Shift within each country so lag values never cross a location boundary.
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [861]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the earliest rows of each country have no history, so the lags are NaN there)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [862]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns, and PCA is fit on all rows before the train/test split — both are
# data leakage; confirm intent. Features are also unscaled before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[862]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [863]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first 6 components (highest variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [864]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the raw
# features they are named after; the labels are misleading, and downstream
# "feature importances" actually describe the PCs, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [865]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never read downstream (only
# 'Mortality Rate' is used from df_updated afterwards) — step looks redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [866]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): X rows come from principal_df and y from df_updated; they align
# only because both frames were built from the same rows in the same order.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% as a test split. NOTE(review): this shuffles a time series
# randomly, so train and test rows are interleaved in time — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [867]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only (test stays unseen).
scaler = StandardScaler()
scaler.fit(X_train)
Out[867]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [868]:
# Apply scaling on the training set, using the statistics learned above
X_train_scaled = scaler.transform(X_train)
In [869]:
# Apply scaling on the test set (train-set statistics; no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [870]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, learning rate, ensemble size,
# minimum split loss (gamma), and row/column subsampling fractions.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [871]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): folds are shuffled rows of a time series and scoring uses the
# regressor's default R^2; consider TimeSeriesSplit for honest estimates.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979685572320796
In [872]:
# GridSearchCV refits the best parameter combination on the whole training
# split by default (refit=True), so best_estimator_ is already trained;
# calling fit() again only repeats identical work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [873]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy rescales both arguments into probability
# distributions and returns their KL divergence — not a regression error
# metric, and zero values in y_test make it ill-defined; confirm.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010286891805743988
R2 Score: 0.994953039214247
RMSE: 0.101424
Entropy Value: 0.001005217028297138
In [874]:
# Rank model inputs by XGBoost's importance scores.
# NOTE(review): the "features" here are mislabeled principal components, so
# these importances do not refer to the raw named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[874]:
feature importance
1 human_development_index 0.658730
5 population 0.288222
2 extreme_poverty 0.024648
0 hospital_beds_per_thousand 0.015281
3 gdp_per_capita 0.007035
4 population_density 0.006085
In [875]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[875]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [876]:
country1 = 'Bulgaria'
country2 = 'Latvia'

# Restrict to the population-health input columns plus identifiers/target,
# keeping only the two countries being compared.
model_cols = ['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), model_cols]
In [877]:
df_updated
Out[877]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2065 rows × 9 columns

In [878]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Shift within each country so lag values never cross a location boundary.
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [879]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the earliest rows of each country have no history, so the lags are NaN there)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [880]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns, and PCA is fit on all rows before the train/test split — both are
# data leakage; confirm intent. Features are also unscaled before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[880]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [881]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first 6 components (highest variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [882]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the raw
# features they are named after; the labels are misleading, and downstream
# "feature importances" actually describe the PCs, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [883]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never read downstream (only
# 'Mortality Rate' is used from df_updated afterwards) — step looks redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [884]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): X rows come from principal_df and y from df_updated; they align
# only because both frames were built from the same rows in the same order.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% as a test split. NOTE(review): this shuffles a time series
# randomly, so train and test rows are interleaved in time — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [885]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only (test stays unseen).
scaler = StandardScaler()
scaler.fit(X_train)
Out[885]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [886]:
# Apply scaling on the training set, using the statistics learned above
X_train_scaled = scaler.transform(X_train)
In [887]:
# Apply scaling on the test set (train-set statistics; no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [888]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, learning rate, ensemble size,
# minimum split loss (gamma), and row/column subsampling fractions.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [889]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): folds are shuffled rows of a time series and scoring uses the
# regressor's default R^2; consider TimeSeriesSplit for honest estimates.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9581157049903706
In [890]:
# GridSearchCV refits the best parameter combination on the whole training
# split by default (refit=True), so best_estimator_ is already trained;
# calling fit() again only repeats identical work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [891]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy rescales both arguments into probability
# distributions and returns their KL divergence — not a regression error
# metric, and zero values in y_test make it ill-defined; confirm.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0020134595466409933
R2 Score: 0.9988351175179158
RMSE: 0.044872
Entropy Value: 0.0002481296115330379
In [892]:
# Rank model inputs by XGBoost's importance scores.
# NOTE(review): the "features" here are mislabeled principal components, so
# these importances do not refer to the raw named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[892]:
feature importance
5 median_age 0.453982
0 diabetes_prevalence 0.442633
2 male_smokers 0.052401
1 female_smokers 0.027709
4 aged_65_older 0.016088
3 life_expectancy 0.007187
In [893]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[893]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [894]:
country1 = 'Bulgaria'
country2 = 'Latvia'

# Restrict to the country-health input columns plus identifiers/target,
# keeping only the two countries being compared.
model_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), model_cols]
In [895]:
df_updated
Out[895]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631969

2065 rows × 9 columns

In [896]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    # Shift within each country so lag values never cross a location boundary.
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [897]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the earliest rows of each country have no history, so the lags are NaN there)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [898]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns, and PCA is fit on all rows before the train/test split — both are
# data leakage; confirm intent. Features are also unscaled before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[898]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [899]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first 6 components (highest variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [900]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6, not the raw
# features they are named after; the labels are misleading, and downstream
# "feature importances" actually describe the PCs, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [901]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never read downstream (only
# 'Mortality Rate' is used from df_updated afterwards) — step looks redundant.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [902]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): X rows come from principal_df and y from df_updated; they align
# only because both frames were built from the same rows in the same order.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% as a test split. NOTE(review): this shuffles a time series
# randomly, so train and test rows are interleaved in time — confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [903]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only (test stays unseen).
scaler = StandardScaler()
scaler.fit(X_train)
Out[903]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [904]:
# Apply scaling on the training set, using the statistics learned above
X_train_scaled = scaler.transform(X_train)
In [905]:
# Apply scaling on the test set (train-set statistics; no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [906]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, learning rate, ensemble size,
# minimum split loss (gamma), and row/column subsampling fractions.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [907]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): folds are shuffled rows of a time series and scoring uses the
# regressor's default R^2; consider TimeSeriesSplit for honest estimates.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9564879049798748
In [908]:
# GridSearchCV refits the best parameter combination on the whole training
# split by default (refit=True), so best_estimator_ is already trained;
# calling fit() again only repeats identical work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [909]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy rescales both arguments into probability
# distributions and returns their KL divergence — not a regression error
# metric, and zero values in y_test make it ill-defined; confirm.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0038412225079292634
R2 Score: 0.9977776693767008
RMSE: 0.061978
Entropy Value: 0.0004044101196355003
In [910]:
# Rank model inputs by XGBoost's importance scores.
# NOTE(review): the "features" here are mislabeled principal components, so
# these importances do not refer to the raw named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[910]:
feature importance
0 hospital_beds_per_thousand 0.829882
5 population 0.121940
2 extreme_poverty 0.019138
1 human_development_index 0.016068
4 population_density 0.008275
3 gdp_per_capita 0.004696
In [911]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[911]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [912]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the two-country subset an independent frame so the lag columns assigned in a
# later cell do not act on a filtered slice (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[['location', 'date', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [913]:
# Display the filtered frame (Romania + Serbia rows only).
df_updated
Out[913]:
location date diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 9.74 22.9 37.1 76.05 17.850 43.0 2.036403

2076 rows × 9 columns

In [914]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged-mortality features (1 day, 1 week, ~1 month back), computed
# independently within each country's time series so lags never cross country boundaries.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for shift_days, lag_col in [(1, 'prev_day_mortality'),
                            (7, 'prev_week_mortality'),
                            (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(shift_days)
In [915]:
# Lags are undefined at the start of each country's series; treat those rows as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [916]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the fit matrix also includes
# 'Mortality Rate' (the prediction target) and the three lag columns added above — the target
# leaks into the components and likely inflates the downstream R^2. Also, PCA is scale-sensitive
# and these columns are unscaled, so large-magnitude features dominate the leading components —
# TODO confirm this is intended; consider fitting on the six predictor columns after scaling.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[916]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [917]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): these are the first 6 components of a PCA fit on ~10 columns; each component is
# a linear combination of ALL fitted columns, not a one-to-one match with the 6 input variables.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [918]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6 but are labelled with original
# feature names; the later "feature importances" therefore describe components, not the named
# features — consider renaming to 'PC1'..'PC6' to avoid misinterpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [919]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below is built from principal_df.
# This step only changes df_updated's layout and can likely be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [920]:
selected_cols = ['diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X rows (from principal_df) and y rows (from df_updated) align positionally because both
# preserve the original row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a time series lets the model train on observations
# adjacent (in time) to test rows; combined with PCA components derived partly from the lagged
# target, this likely explains the near-perfect scores — TODO confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [921]:
# Fit scaling on the training set only; the test set is transformed with these same
# statistics below, which correctly avoids test-set leakage into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[921]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [922]:
# Apply the fitted training-set scaling to the training set.
X_train_scaled = scaler.transform(X_train)
In [923]:
# Apply the training-set scaling parameters to the test set (no refit).
X_test_scaled = scaler.transform(X_test)
In [924]:
# Define XGBoost model with a fixed seed so that stochastic row/column subsampling
# (subsample / colsample_bytree < 1 in the grid) gives reproducible results on re-run.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid for GridSearchCV: 3*3*3*3*2*2 = 324 candidate settings.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [925]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3240 model fits; n_jobs=-1 parallelizes across all cores.
# Default scoring for a regressor is R^2, so best_score_ below is a mean CV R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982335652486138
In [926]:
# GridSearchCV refits the best estimator on the full training set by default (refit=True),
# so best_estimator_ is already trained — the extra fit() call here was redundant work
# and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [927]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes its inputs into probability
# distributions and returns their KL divergence; applied to raw mortality values it is not a
# standard regression metric (zeros in y_pred can even yield inf) — TODO confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015563262679148646
R2 Score: 0.9990938937518747
RMSE: 0.039450
Entropy Value: 0.0004499334264955072
In [928]:
# Rank inputs by XGBoost importance, highest first.
# NOTE(review): selected_cols names original variables, but X was built from PCA components,
# so these scores describe components rather than the named features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
Out[928]:
feature importance
0 diabetes_prevalence 0.470175
5 median_age 0.305786
1 female_smokers 0.213590
2 male_smokers 0.007235
3 life_expectancy 0.002493
4 aged_65_older 0.000721
In [929]:
# Importing the dataframe of all 26 countries
# NOTE(review): this load-filter-lag-PCA-XGBoost pipeline is copy-pasted for each country
# pair / feature set; consider factoring it into a parameterized function to avoid drift.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[929]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [930]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the subset an independent frame so the lag-column assignments in a later
# cell do not act on a filtered slice (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [931]:
# Display the filtered frame (Romania + Serbia rows only).
df_updated
Out[931]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 9 columns

In [932]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged-mortality features (1 day, 1 week, ~1 month back) per country,
# so lags never cross country boundaries.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for shift_days, lag_col in [(1, 'prev_day_mortality'),
                            (7, 'prev_week_mortality'),
                            (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(shift_days)
In [933]:
# Lags are undefined at the start of each country's series; treat those rows as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [934]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' (the target) and the three lag
# columns, so the target leaks into the components; and the columns are unscaled, so
# 'population' (~10^7) dominates the leading components — TODO confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[934]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [935]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): each kept component mixes ALL fitted columns, not just the 6 inputs.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [936]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are PCs mislabelled with original feature names; downstream
# "feature importances" therefore describe components, not these features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [937]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used for modeling — X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [938]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X (principal_df) and y (df_updated) align positionally; both preserve original row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled random split of a time series — adjacent-in-time rows land in both
# train and test, which inflates the scores — TODO confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [939]:
# Fit scaling on the training set only; test set is transformed with the same statistics below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[939]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [940]:
# Apply the fitted training-set scaling to the training set.
X_train_scaled = scaler.transform(X_train)
In [941]:
# Apply the training-set scaling parameters to the test set (no refit).
X_test_scaled = scaler.transform(X_test)
In [942]:
# Define XGBoost model with a fixed seed so that stochastic row/column subsampling
# (subsample / colsample_bytree < 1 in the grid) gives reproducible results on re-run.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid for GridSearchCV: 3*3*3*3*2*2 = 324 candidate settings.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [943]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3240 fits; n_jobs=-1 uses all cores; default scoring is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979474526774726
In [944]:
# GridSearchCV (refit=True by default) has already refit the best estimator on the full
# training set, so the extra fit() call here was redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [945]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between normalized distributions;
# it is not a standard regression metric for raw mortality values — TODO confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002076185405706244
R2 Score: 0.9987912273877525
RMSE: 0.045565
Entropy Value: 0.00045853981129780964
In [946]:
# Rank inputs by XGBoost importance, highest first.
# NOTE(review): X was built from PCA components, so these scores describe components,
# not the original features they are labelled with.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
Out[946]:
feature importance
5 population 0.663901
0 hospital_beds_per_thousand 0.186232
1 human_development_index 0.124847
2 extreme_poverty 0.013139
3 gdp_per_capita 0.011260
4 population_density 0.000621
In [3]:
# Country Pair by Pair Analysis relative to male smokers
In [4]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hard-coded absolute Windows path — consider a configurable DATA_DIR.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[4]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [5]:
# Showing the pairings of countries based on male smokers (13 pairs of countries)
# NOTE(review): 26 near-identical filters; a dict comprehension keyed by country name
# (e.g. {c: df[df.location == c] for c in countries}) would remove the repetition —
# kept as explicit variables here because later cells reference them by name.
df_Canada = df[(df.location == "Canada")]
df_Denmark = df[(df.location == "Denmark")]

df_Finland = df[(df.location == "Finland")]
df_Iceland = df[(df.location == "Iceland")]

df_Ireland = df[(df.location == "Ireland")]
df_Luxembourg = df[(df.location == "Luxembourg")]

df_Netherlands = df[(df.location == "Netherlands")]
df_Slovenia = df[(df.location == "Slovenia")]

df_Sweden = df[(df.location == "Sweden")]
df_UnitedKingdom = df[(df.location == "United Kingdom")]

df_UnitedStates = df[(df.location == "United States")]
df_Austria = df[(df.location == "Austria")]

df_Belgium = df[(df.location == "Belgium")]
df_Czechia = df[(df.location == "Czechia")]

df_Estonia = df[(df.location == "Estonia")]
df_France = df[(df.location == "France")]

df_Italy = df[(df.location == "Italy")]
df_Portugal = df[(df.location == "Portugal")]

df_Romania = df[(df.location == "Romania")]
df_Serbia = df[(df.location == "Serbia")]

df_Slovakia = df[(df.location == "Slovakia")]
df_Spain = df[(df.location == "Spain")]

df_Switzerland = df[(df.location == "Switzerland")]
df_Bulgaria = df[(df.location == "Bulgaria")]

df_Cyprus = df[(df.location == "Cyprus")]
df_Latvia = df[(df.location == "Latvia")]
In [6]:
# Drop the first 2 rows of the UK frame (tail(-2) keeps all rows except the first two).
# NOTE(review): presumably this aligns the UK date range with the other countries — TODO confirm.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [7]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file.
# index=False keeps the (non-unique, concatenated) row index out of the CSV so re-importing
# does not produce a spurious 'Unnamed: 0' column.
dataframe_one.to_csv("dataframe-one.csv", index=False)
In [8]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): the previous cell wrote "dataframe-one.csv" to the working directory, but this
# reads from Downloads — the file was presumably moved by hand; confirm they are the same file.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[8]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [9]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the two-country subset an independent frame so the lag columns assigned in a
# later cell do not act on a filtered slice (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [10]:
# Display the filtered frame (Canada + Denmark rows only).
df_updated
Out[10]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 80.90 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 82.43 16.984 41.4 1.093162

2134 rows × 9 columns

In [11]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged-mortality features (1 day, 1 week, ~1 month back) per country,
# so lags never cross country boundaries.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for shift_days, lag_col in [(1, 'prev_day_mortality'),
                            (7, 'prev_week_mortality'),
                            (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(shift_days)
In [12]:
# Lags are undefined at the start of each country's series; treat those rows as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [13]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' (the target) and the three lag
# columns, so the target leaks into the components; the columns are also unscaled, so
# large-magnitude features dominate the leading components — TODO confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[13]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [14]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): each kept component mixes ALL fitted columns, not just the 6 inputs.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [15]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are PCs mislabelled with original feature names; downstream
# "feature importances" therefore describe components, not the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [16]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used for modeling — X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [17]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X (principal_df) and y (df_updated) align positionally; both preserve original row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled random split of a time series — adjacent-in-time rows land in both
# train and test, which inflates the scores — TODO confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [18]:
# Fit scaling on the training set only; test set is transformed with the same statistics below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[18]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [19]:
# Apply the fitted training-set scaling to the training set.
X_train_scaled = scaler.transform(X_train)
In [20]:
# Apply the training-set scaling parameters to the test set (no refit).
X_test_scaled = scaler.transform(X_test)
In [21]:
# Define XGBoost model with a fixed seed so that stochastic row/column subsampling
# (subsample / colsample_bytree < 1 in the grid) gives reproducible results on re-run.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid for GridSearchCV: 3*3*3*3*2*2 = 324 candidate settings.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [22]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3240 fits; n_jobs=-1 uses all cores; default scoring is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991198472250268
In [23]:
# GridSearchCV (refit=True by default) has already refit the best estimator on the full
# training set, so the extra fit() call here was redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [24]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence between normalized distributions;
# it is not a standard regression metric for raw mortality values — TODO confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0035622981531189114
R2 Score: 0.999149818574882
RMSE: 0.059685
Entropy Value: 0.000330791989058651
In [25]:
# Rank inputs by XGBoost importance, highest first.
# NOTE(review): X was built from PCA components, so these scores describe components,
# not the original features they are labelled with.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)
feature_importances
Out[25]:
feature importance
1 diabetes_prevalence 0.743513
0 cardiovasc_death_rate 0.166964
5 median_age 0.068076
2 female_smokers 0.017976
3 life_expectancy 0.003401
4 aged_65_older 0.000071
In [26]:
# Importing the dataframe of all 26 countries
# NOTE(review): third copy of the same pipeline — consider a parameterized helper function.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[26]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [27]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the subset an independent frame so the lag-column assignments in a later
# cell do not act on a filtered slice (avoids pandas' SettingWithCopyWarning).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [28]:
# Display the filtered frame (Canada + Denmark rows only).
df_updated
Out[28]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
5188 Denmark 2/3/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
5189 Denmark 2/4/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
5190 Denmark 2/5/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
5191 Denmark 2/6/2020 2.5 0.940 0.2 46682.515 136.520 5882259 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.5 44017.591 4.037 38454328 1.093162

2134 rows × 9 columns

In [29]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged-mortality features (1 day, 1 week, ~1 month back) per country,
# so lags never cross country boundaries.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for shift_days, lag_col in [(1, 'prev_day_mortality'),
                            (7, 'prev_week_mortality'),
                            (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(shift_days)
In [30]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first 1/7/30 rows of each country have no lag history; treat missing
# history as a mortality rate of 0. One vectorized fillna over the three lag
# columns replaces three separate per-column assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [31]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] at this point spans 10 columns — the 6
# predictors plus 'Mortality Rate' and its 3 lag columns — so the target leaks
# into the PCA fit. The inputs are also unscaled (population ~1e6–1e8 dwarfs
# the other columns and dominates the variance), and the fit uses the full
# dataset before the train/test split. Preferred: scale, then fit PCA on the
# training predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[31]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [32]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep only the first 6 components (ranked by explained variance) of the
# 10-column matrix the PCA was fitted on.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [33]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of all the PCA input columns), not the original
# feature it is named after. Downstream "feature importances" therefore rank
# components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [34]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df — so this cell has no effect on the model and could be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [35]:
# Assemble the model matrix: X = the 6 principal components (carrying original
# feature names), y = the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; on a daily time series
# with near-constant country-level features, adjacent days end up in both train
# and test, which inflates the scores. A chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [36]:
# Fit scaling on the training set
# Fitting on the training split only is correct (no test leakage into the
# scaler). NOTE(review): scaling is applied after PCA here; the conventional
# order is to standardize first, then fit PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[36]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [37]:
# Apply scaling on the training set
# (Uses the train-fitted mean/std from the scaler above.)
X_train_scaled = scaler.transform(X_train)
In [38]:
# Apply scaling on the test set
# (Transform only — the scaler must never be re-fitted on test data.)
X_test_scaled = scaler.transform(X_test)
In [39]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with the 10-fold CV below this
# means 3,240 model fits per grid search.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [40]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Scoring defaults to the estimator's score method (R^2 for a regressor);
# n_jobs=-1 parallelizes across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987973362067224
In [41]:
# Fit the model using the best hyperparameters
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training set, so this .fit() call is redundant
# (harmless, but wasted compute).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [42]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and zeros in either array can yield inf.
# Interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0034098009007722373
R2 Score: 0.9991862137124461
RMSE: 0.058394
Entropy Value: 0.00042828048166675814
In [43]:
# Rank the model inputs by XGBoost importance, highest first.
# (The inputs are principal components relabeled with the original feature
# names, so this ranking describes components, not the raw variables.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[43]:
feature importance
1 human_development_index 0.565996
0 hospital_beds_per_thousand 0.211338
3 gdp_per_capita 0.083368
5 population 0.079615
2 extreme_poverty 0.059425
4 population_density 0.000259
In [44]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows user path — not portable; prefer a
# configurable data directory (e.g. pathlib.Path) so the notebook runs on other
# machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[44]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [45]:
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): this per-country-pair pipeline is copy-pasted repeatedly in the
# notebook; a parameterized function (pair, feature set) would remove the
# duplication and the risk of inconsistent edits.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [46]:
df_updated
Out[46]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
7311 Finland 1/30/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
7312 Finland 1/31/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
7313 Finland 2/1/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
7314 Finland 2/2/2020 153.507 5.76 18.3 81.91 21.228 42.8 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 82.99 14.431 37.3 0.11011

2102 rows × 9 columns

In [47]:
# Convert the time series into a supervised-learning layout: for each country,
# add the mortality rate observed 1 day, 7 days, and 30 days earlier as lagged
# feature columns, so XGBoost can treat each daily row as an independent
# observation. shift() is applied within each location group so one country's
# lags never bleed into another country's rows.
# (Replaces three separate groupby computations with one reusable grouped Series.)
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag, label in [(1, 'day'), (7, 'week'), (30, 'month')]:
    df_updated[f'prev_{label}_mortality'] = mortality_by_country.shift(lag)
In [48]:
# The first 1/7/30 rows of each country have no lag history; treat missing
# history as a mortality rate of 0. One vectorized fillna over the three lag
# columns replaces three separate per-column assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [49]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still includes 'Mortality Rate' and its
# 3 lag columns, so the target leaks into the PCA fit; the inputs are unscaled
# and the fit uses the full dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[49]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [50]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [51]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of all the PCA input columns), not the original
# feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [52]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [53]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [54]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[54]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [55]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [56]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [57]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [58]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9976265506566767
In [59]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [60]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and zeros in either array can yield inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002104558864221546
R2 Score: 0.9982281951732214
RMSE: 0.045875
Entropy Value: 0.0006809600254822203
In [61]:
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): these "features" are relabeled principal components, so the
# ranking describes components, not the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[61]:
feature importance
1 diabetes_prevalence 0.566317
0 cardiovasc_death_rate 0.374238
5 median_age 0.048315
2 female_smokers 0.006486
3 life_expectancy 0.003613
4 aged_65_older 0.001031
In [62]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows user path — not portable; prefer a
# configurable data directory (e.g. pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[62]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [63]:
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [64]:
df_updated
Out[64]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
7310 Finland 1/29/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
7311 Finland 1/30/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
7312 Finland 1/31/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
7313 Finland 2/1/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
7314 Finland 2/2/2020 3.28 0.938 0.04 40585.721 18.136 5540745 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2102 rows × 9 columns

In [65]:
# Convert the time series into a supervised-learning layout: for each country,
# add the mortality rate observed 1 day, 7 days, and 30 days earlier as lagged
# feature columns, so XGBoost can treat each daily row as an independent
# observation. shift() is applied within each location group so one country's
# lags never bleed into another country's rows.
# (Replaces three separate groupby computations with one reusable grouped Series.)
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag, label in [(1, 'day'), (7, 'week'), (30, 'month')]:
    df_updated[f'prev_{label}_mortality'] = mortality_by_country.shift(lag)
In [66]:
# The first 1/7/30 rows of each country have no lag history; treat missing
# history as a mortality rate of 0. One vectorized fillna over the three lag
# columns replaces three separate per-column assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [67]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still includes 'Mortality Rate' and its
# 3 lag columns, so the target leaks into the PCA fit; the inputs are unscaled
# and the fit uses the full dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[67]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [68]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [69]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of all the PCA input columns), not the original
# feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [70]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [71]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [72]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[72]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [73]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [74]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [75]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [76]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9972033148176618
In [77]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [78]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and zeros in either array can yield inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004836890703946099
R2 Score: 0.9959278752229046
RMSE: 0.069548
Entropy Value: 0.0012304204035138641
In [79]:
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): these "features" are relabeled principal components, so the
# ranking describes components, not the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[79]:
feature importance
1 human_development_index 0.327785
0 hospital_beds_per_thousand 0.249805
2 extreme_poverty 0.231512
5 population 0.119151
3 gdp_per_capita 0.062924
4 population_density 0.008824
In [80]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows user path — not portable; prefer a
# configurable data directory (e.g. pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[80]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [81]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [82]:
df_updated
Out[82]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 82.30 13.928 38.7 0.491388

2076 rows × 9 columns

In [83]:
# Convert the time series into a supervised-learning layout: for each country,
# add the mortality rate observed 1 day, 7 days, and 30 days earlier as lagged
# feature columns, so XGBoost can treat each daily row as an independent
# observation. shift() is applied within each location group so one country's
# lags never bleed into another country's rows.
# (Replaces three separate groupby computations with one reusable grouped Series.)
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag, label in [(1, 'day'), (7, 'week'), (30, 'month')]:
    df_updated[f'prev_{label}_mortality'] = mortality_by_country.shift(lag)
In [84]:
# The first 1/7/30 rows of each country have no lag history; treat missing
# history as a mortality rate of 0. One vectorized fillna over the three lag
# columns replaces three separate per-column assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [85]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still includes 'Mortality Rate' and its
# 3 lag columns, so the target leaks into the PCA fit; the inputs are unscaled
# and the fit uses the full dataset before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[85]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [86]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [87]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of all the PCA input columns), not the original
# feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [88]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [89]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [90]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[90]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [91]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [92]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [93]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [94]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987986251987184
In [95]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [96]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and zeros in either array can yield inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0017173908797679875
R2 Score: 0.9992479140176413
RMSE: 0.041441
Entropy Value: 0.00039280120442968804
In [97]:
# Rank the model inputs by XGBoost importance, highest first.
# NOTE(review): these "features" are relabeled principal components, so the
# ranking describes components, not the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[97]:
feature importance
5 median_age 0.765400
0 cardiovasc_death_rate 0.195268
1 diabetes_prevalence 0.026928
2 female_smokers 0.010463
3 life_expectancy 0.001324
4 aged_65_older 0.000616
In [98]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows user path — not portable; prefer a
# configurable data directory (e.g. pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[98]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [99]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the country health index.
# FIX(review): .copy() makes the filtered frame an independent object, so the lagged-column
# assignments in the following cells do not operate on a view of the original frame
# (avoids pandas' SettingWithCopyWarning / silent chained-assignment behaviour).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [100]:
# Inspect the filtered two-country frame (bare last expression → rich HTML display)
df_updated
Out[100]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388

2076 rows × 9 columns

In [101]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Shifts are applied within each country (groupby 'location') so one country's history never
# bleeds into another's. NOTE(review): shift() is purely positional — this assumes rows are
# sorted by date within each location; confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [102]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country understates early mortality;
# dropping those warm-up rows is a cleaner alternative.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [103]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[103]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [104]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [105]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components (linear mixtures of ALL PCA inputs),
# not the original variables — reusing the raw feature names here is misleading and makes the
# later "feature importances" look like importances of the named variables. Consider PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [106]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (only 'Mortality Rate' is read from
# df_updated afterwards), so this encoding step appears to be dead code here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [107]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on strongly autocorrelated daily series places test
# days adjacent to training days, inflating scores; a chronological split (TimeSeriesSplit)
# would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [108]:
# Fit scaling on the training set (correct: scaler statistics come from training data only)
scaler = StandardScaler()
scaler.fit(X_train)
Out[108]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [109]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [110]:
# Apply scaling on the test set (using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
In [111]:
# Define XGBoost model
# NOTE(review): pass random_state to XGBRegressor so subsample/colsample draws are reproducible.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [112]:
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all CPU cores.
# Default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986051830769147
In [113]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator by default (refit=True); this extra
# fit() is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [114]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into probability
# distributions and returns their KL divergence; on raw regression targets this is not a
# standard regression metric — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015122598797065847
R2 Score: 0.9993377457219498
RMSE: 0.038888
Entropy Value: 0.0003022118010805689
In [115]:
# NOTE(review): these importances are for PCA components mislabelled with raw column names
# upstream — they are NOT importances of the named raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[115]:
feature importance
5 population 0.715384
1 human_development_index 0.205967
0 hospital_beds_per_thousand 0.052834
2 extreme_poverty 0.024062
3 gdp_per_capita 0.001458
4 population_density 0.000295
In [116]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[116]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [117]:
country1 = 'Netherlands'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# FIX(review): .copy() makes the filtered frame an independent object, so the lagged-column
# assignments in the following cells do not operate on a view of the original frame
# (avoids pandas' SettingWithCopyWarning / silent chained-assignment behaviour).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [118]:
# Inspect the filtered two-country frame (bare last expression → rich HTML display)
df_updated
Out[118]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 81.32 19.062 44.5 0.536669

2099 rows × 9 columns

In [119]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Shifts are applied within each country (groupby 'location') so one country's history never
# bleeds into another's. NOTE(review): shift() is purely positional — this assumes rows are
# sorted by date within each location; confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [120]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country understates early mortality;
# dropping those warm-up rows is a cleaner alternative.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [121]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[121]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [122]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [123]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components (linear mixtures of ALL PCA inputs),
# not the original variables — reusing the raw feature names here is misleading and makes the
# later "feature importances" look like importances of the named variables. Consider PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [124]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (only 'Mortality Rate' is read from
# df_updated afterwards), so this encoding step appears to be dead code here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [125]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on strongly autocorrelated daily series places test
# days adjacent to training days, inflating scores; a chronological split (TimeSeriesSplit)
# would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [126]:
# Fit scaling on the training set (correct: scaler statistics come from training data only)
scaler = StandardScaler()
scaler.fit(X_train)
Out[126]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [127]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [128]:
# Apply scaling on the test set (using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
In [129]:
# Define XGBoost model
# NOTE(review): pass random_state to XGBRegressor so subsample/colsample draws are reproducible.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [130]:
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all CPU cores.
# Default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992962940278153
In [131]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator by default (refit=True); this extra
# fit() is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [132]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into probability
# distributions and returns their KL divergence; on raw regression targets this is not a
# standard regression metric — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006026879845882446
R2 Score: 0.9992643953405248
RMSE: 0.077633
Entropy Value: 0.0003719429700690147
In [133]:
# NOTE(review): these importances are for PCA components mislabelled with raw column names
# upstream — they are NOT importances of the named raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[133]:
feature importance
1 diabetes_prevalence 0.933080
0 cardiovasc_death_rate 0.038983
5 median_age 0.013890
2 female_smokers 0.012534
3 life_expectancy 0.001336
4 aged_65_older 0.000178
In [134]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[134]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [135]:
country1 = 'Netherlands'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the country health index.
# FIX(review): .copy() makes the filtered frame an independent object, so the lagged-column
# assignments in the following cells do not operate on a view of the original frame
# (avoids pandas' SettingWithCopyWarning / silent chained-assignment behaviour).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [136]:
# Inspect the filtered two-country frame (bare last expression → rich HTML display)
df_updated
Out[136]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2099 rows × 9 columns

In [137]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Shifts are applied within each country (groupby 'location') so one country's history never
# bleeds into another's. NOTE(review): shift() is purely positional — this assumes rows are
# sorted by date within each location; confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [138]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country understates early mortality;
# dropping those warm-up rows is a cleaner alternative.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [139]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[139]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [140]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [141]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components (linear mixtures of ALL PCA inputs),
# not the original variables — reusing the raw feature names here is misleading and makes the
# later "feature importances" look like importances of the named variables. Consider PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [142]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (only 'Mortality Rate' is read from
# df_updated afterwards), so this encoding step appears to be dead code here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [143]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on strongly autocorrelated daily series places test
# days adjacent to training days, inflating scores; a chronological split (TimeSeriesSplit)
# would give a more honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [144]:
# Fit scaling on the training set (correct: scaler statistics come from training data only)
scaler = StandardScaler()
scaler.fit(X_train)
Out[144]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [145]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [146]:
# Apply scaling on the test set (using the training-set statistics)
X_test_scaled = scaler.transform(X_test)
In [147]:
# Define XGBoost model
# NOTE(review): pass random_state to XGBRegressor so subsample/colsample draws are reproducible.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [148]:
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all CPU cores.
# Default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991623229575961
In [149]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits the best estimator by default (refit=True); this extra
# fit() is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [150]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into probability
# distributions and returns their KL divergence; on raw regression targets this is not a
# standard regression metric — consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0044516891807437255
R2 Score: 0.9994566536271454
RMSE: 0.066721
Entropy Value: 0.00032994017253004367
In [151]:
# NOTE(review): these importances are for PCA components mislabelled with raw column names
# upstream — they are NOT importances of the named raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[151]:
feature importance
1 human_development_index 0.874638
2 extreme_poverty 0.051845
0 hospital_beds_per_thousand 0.046778
5 population 0.018835
3 gdp_per_capita 0.006703
4 population_density 0.001201
In [152]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR / relative
# path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[152]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [153]:
country1 = 'Sweden'
country2 = 'United Kingdom'

# Extracting important features for XGBoost Model Analysis for the population health index.
# FIX(review): .copy() makes the filtered frame an independent object, so the lagged-column
# assignments in the following cells do not operate on a view of the original frame
# (avoids pandas' SettingWithCopyWarning / silent chained-assignment behaviour).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [154]:
# Inspect the filtered two-country frame (bare last expression → rich HTML display).
# NOTE(review): the displayed early-2020 UK mortality rates (50/100/25%) look like
# small-denominator artifacts — verify upstream cleaning.
df_updated
Out[154]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 82.80 19.985 41.0 0.816005

2126 rows × 9 columns

In [155]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# Shifts are applied within each country (groupby 'location') so one country's history never
# bleeds into another's. NOTE(review): shift() is purely positional — this assumes rows are
# sorted by date within each location; confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [156]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country understates early mortality;
# dropping those warm-up rows is a cleaner alternative.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [157]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[157]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [158]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [159]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [160]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [161]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']

# Model inputs: the six principal components (labelled with original feature
# names upstream); target: the daily COVID-19 mortality rate.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [162]:
# Fit the scaler on the training data only, so test-set statistics
# never influence the transformation.
scaler = StandardScaler().fit(X_train)
Out[162]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [163]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses statistics learned from X_train only
In [164]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuse training-set statistics; never refit on test data
In [165]:
# Base XGBoost regressor; all tuned settings come from the grid below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate configurations.
params = {
    'max_depth':        [3, 4, 5],
    'learning_rate':    [0.1, 0.01, 0.001],
    'n_estimators':     [50, 100, 150],
    'gamma':            [0, 0.1, 0.2],
    'subsample':        [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [166]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises over all CPU cores; 324 candidates x 10 folds = 3240 fits.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9588473335928803
In [167]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the entire training set — an extra fit() call here is redundant
# and simply repeats the same training.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out (scaled) test set.
y_pred = best_model.predict(X_test_scaled)
In [168]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors to sum to 1
# and returns their KL divergence — a distribution-divergence measure, not a
# pointwise regression error; it returns inf when y_pred is 0 where y_test > 0.
# Confirm this is the intended metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.47227318644134914
R2 Score: 0.9826662059026536
RMSE: 0.687221
Entropy Value: 0.0030797153679582856
In [169]:
# NOTE(review): these importances belong to principal components, which were
# labelled with the original feature names when principal_df was built — they
# must not be read as importances of the raw health variables themselves.
feature_importances = best_model.feature_importances_
# Reassigning the same name to a DataFrame shadows the ndarray above; a
# distinct name (e.g. importance_df) would be clearer.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[169]:
feature importance
0 cardiovasc_death_rate 0.414115
5 median_age 0.352922
2 female_smokers 0.062994
1 diabetes_prevalence 0.060655
3 life_expectancy 0.059176
4 aged_65_older 0.050139
In [170]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[170]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [171]:
# Country pair analysed in this run of the country-health-index model.
country1 = 'Sweden'
country2 = 'United Kingdom'

# Restrict to the socio-economic features plus the target, for the two countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
             'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [172]:
df_updated
Out[172]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.816005

2126 rows × 9 columns

In [173]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): this lag -> fillna -> PCA -> split -> scale -> grid-search pipeline is
# copy-pasted once per country pair throughout the notebook; extracting it into
# a parameterised function would prevent the copies drifting apart.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [174]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates "no mortality" values for the first
# 1/7/30 days of each country; dropping those rows may be safer — confirm.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [175]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fitted on unscaled features and on the full dataset before the
# train/test split below (test-set leakage into the transform) — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])  # every column from index 2 onward (numeric features + lags)
Out[175]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [176]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [177]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [178]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [179]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [180]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[180]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [181]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [182]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [183]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [184]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9546833804604962
In [185]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the entire training set — an extra fit() call here is redundant
# and simply repeats the same training.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out (scaled) test set.
y_pred = best_model.predict(X_test_scaled)
In [186]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors and returns
# their KL divergence — not a pointwise regression error; it returns inf when
# y_pred is 0 where y_test > 0. Confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.997926932618917
R2 Score: 0.9633731906218204
RMSE: 0.998963
Entropy Value: 0.005106077515730719
In [187]:
# NOTE(review): these importances belong to principal components, which were
# labelled with the original socio-economic feature names when principal_df was
# built — they must not be read as importances of the raw variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[187]:
feature importance
1 human_development_index 0.576604
2 extreme_poverty 0.127571
5 population 0.122504
0 hospital_beds_per_thousand 0.113870
4 population_density 0.046708
3 gdp_per_capita 0.012743
In [188]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[188]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [189]:
# Country pair analysed in this run of the population-health-index model.
country1 = 'United States'
country2 = 'Austria'

# Restrict to the population-health features plus the target, for the two countries.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [190]:
df_updated
Out[190]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 78.86 15.413 38.3 1.084791

2112 rows × 9 columns

In [191]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [192]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [193]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[193]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [194]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [195]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [196]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [197]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [198]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[198]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [199]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [200]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [201]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [202]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.983794331453694
In [203]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the entire training set — an extra fit() call here is redundant
# and simply repeats the same training.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out (scaled) test set.
y_pred = best_model.predict(X_test_scaled)
In [204]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors and returns
# their KL divergence — not a pointwise regression error; it returns inf when
# y_pred is 0 where y_test > 0. Confirm this is the intended metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00854869651988359
R2 Score: 0.9945310004412267
RMSE: 0.092459
Entropy Value: 0.0006678574833760092
In [205]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[205]:
feature importance
1 diabetes_prevalence 0.631029
5 median_age 0.184170
0 cardiovasc_death_rate 0.107669
2 female_smokers 0.031789
4 aged_65_older 0.031600
3 life_expectancy 0.013742
In [206]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[206]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [207]:
# Country pair analysed in this run of the country-health-index model.
country1 = 'United States'
country2 = 'Austria'

# Restrict to the socio-economic features plus the target, for the two countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
             'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
             'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [208]:
df_updated
Out[208]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2112 rows × 9 columns

In [209]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [210]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [211]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[211]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [212]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [213]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [214]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [215]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [216]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[216]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [217]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [218]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [219]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [220]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9887577648734872
In [221]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on the entire training set — an extra fit() call here is redundant
# and simply repeats the same training.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out (scaled) test set.
y_pred = best_model.predict(X_test_scaled)
In [222]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00406836830426948
R2 Score: 0.9973972751975432
RMSE: 0.063784
Entropy Value: 0.0005141467774508717
In [223]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[223]:
feature importance
1 human_development_index 0.670685
5 population 0.231413
4 population_density 0.042005
2 extreme_poverty 0.032527
3 gdp_per_capita 0.019548
0 hospital_beds_per_thousand 0.003822
In [224]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[224]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [225]:
# Country pair analysed in this run of the population-health-index model.
country1 = 'Belgium'
country2 = 'Czechia'

# Restrict to the population-health features plus the target, for the two countries.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [226]:
df_updated
Out[226]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 30.5 79.38 19.027 43.3 0.919575

2094 rows × 9 columns

In [227]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features (1 day / 7 days / 30 days back), computed per
# country so lag values never cross a location boundary.
# NOTE(review): shift() is positional — assumes rows are already date-sorted
# within each location; confirm the upstream ordering.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_name, lag in [('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)]:
    df_updated[lag_name] = mortality_by_country.shift(lag)
In [228]:
# Replace NaN values in the three lag columns with 0. These NaNs are the
# warm-up rows at the start of each country's series (no history to shift in).
for lag_name in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_name] = df_updated[lag_name].fillna(0)
In [229]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns, so the components are fit on the target itself — likely leakage.
# NOTE(review): PCA on unscaled data lets high-variance columns dominate the
# components, and fitting before the train/test split lets test rows influence
# them; scale first and fit on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[229]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [230]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep the first 6 components (ordered by explained variance). Each component
# is a linear mix of ALL columns passed to fit(), not one original feature.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [231]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the raw features;
# reusing the raw feature names here is misleading and propagates into the
# feature-importance table computed later.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [232]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df), so this step has no effect on the model input.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [233]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): principal_df rows align positionally with df_updated, so X and
# y stay paired. A random split of a time series whose PCA inputs included the
# target and its lags leaks future information into training — a chronological
# split would be safer and likely explains the near-perfect R^2 below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [234]:
# Fit scaling on the training set
# Mean/std statistics come from the training split only; the test set is
# transformed with these same statistics in a later cell.
scaler = StandardScaler()
scaler.fit(X_train)
Out[234]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [235]:
# Apply scaling on the training set (using the training-set mean/std fitted above)
X_train_scaled = scaler.transform(X_train)
In [236]:
# Apply scaling on the test set (same training-set statistics; no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [237]:
# Define XGBoost model (defaults; GridSearchCV below picks the settings)
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 combinations, each fit once
# per fold by the 10-fold cross-validation in the next cell (3,240 fits).
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [238]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): cv=10 uses unshuffled KFold here; on time-ordered rows this
# mixes past and future within folds — TimeSeriesSplit is the usual choice.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987378590313624
In [239]:
# Fit the model using the best hyperparameters
# NOTE: GridSearchCV already refit best_estimator_ on the whole training set
# (refit=True is the default), so this fit() call retrains an identical model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [240]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and misbehaves if y_pred contains negatives.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012446433855461606
R2 Score: 0.9988990709156199
RMSE: 0.111564
Entropy Value: 0.0005707690311533189
In [241]:
# Gain-based importances from the tuned model, sorted most-important first.
# NOTE(review): the 'feature' labels are PCA components that were renamed with
# raw-feature names, so these importances describe components, not raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[241]:
feature importance
1 diabetes_prevalence 0.740967
0 cardiovasc_death_rate 0.208080
5 median_age 0.027986
2 female_smokers 0.021149
3 life_expectancy 0.001681
4 aged_65_older 0.000137
In [242]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory / relative path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[242]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [243]:
country1 = 'Belgium'
country2 = 'Czechia'

# Restrict to the country-health-index features (plus identifiers and the
# target) for the two countries compared in this XGBoost analysis.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
in_scope = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_scope, feature_cols]
In [244]:
df_updated
Out[244]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.63 0.900 0.0 32605.906 137.176 10493990 0.919575

2094 rows × 9 columns

In [245]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() is positional — assumes rows are already date-sorted
# within each location; confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [246]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates a "0 mortality"
# history at the start of each country's series; dropping them is an option.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [247]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns, so the components are fit on the target itself — likely leakage.
# NOTE(review): PCA on unscaled data (including raw 'population') lets
# high-variance columns dominate; scale first and fit on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[247]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [248]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep the first 6 components (ordered by explained variance). Each component
# is a linear mix of ALL columns passed to fit(), not one original feature.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [249]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the raw features;
# reusing the raw feature names here is misleading and propagates into the
# feature-importance table computed later.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [250]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df), so this step has no effect on the model input.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [251]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): principal_df rows align positionally with df_updated, so X and
# y stay paired. A random split of a time series whose PCA inputs included the
# target and its lags leaks future information into training — a chronological
# split would be safer and likely explains the near-perfect R^2 below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [252]:
# Fit scaling on the training set
# Mean/std statistics come from the training split only; the test set is
# transformed with these same statistics in a later cell.
scaler = StandardScaler()
scaler.fit(X_train)
Out[252]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [253]:
# Apply scaling on the training set (using the training-set mean/std fitted above)
X_train_scaled = scaler.transform(X_train)
In [254]:
# Apply scaling on the test set (same training-set statistics; no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [255]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3,240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [256]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): cv=10 uses unshuffled KFold here; on time-ordered rows this
# mixes past and future within folds — TimeSeriesSplit is the usual choice.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.998309889655373
In [257]:
# Fit the model using the best hyperparameters
# NOTE: GridSearchCV already refit best_estimator_ on the whole training set
# (refit=True is the default), so this fit() call retrains an identical model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [258]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and misbehaves if y_pred contains negatives.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.014806362040415705
R2 Score: 0.9986903273023058
RMSE: 0.121681
Entropy Value: 0.0008238184433236955
In [259]:
# NOTE(review): the 'feature' labels are PCA components that were renamed with
# raw-feature names when principal_df was built, so these importances describe
# components, not the original variables. Name reuse below (array -> DataFrame)
# is also worth avoiding.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[259]:
feature importance
1 human_development_index 0.687524
2 extreme_poverty 0.166036
0 hospital_beds_per_thousand 0.087129
5 population 0.043040
3 gdp_per_capita 0.015953
4 population_density 0.000318
In [260]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory / relative path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[260]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [261]:
country1 = 'Estonia'
country2 = 'France'

# Restrict to the population-health features (plus identifiers and the target)
# for the two countries compared in this XGBoost analysis.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
                'Mortality Rate']
in_scope = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_scope, feature_cols]
In [262]:
df_updated
Out[262]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 78.74 19.452 42.7 0.000000
... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 82.66 19.718 42.0 0.411892

2132 rows × 9 columns

In [263]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() is positional — assumes rows are already date-sorted
# within each location (the Estonia rows above show gaps in early dates);
# confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [264]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates a "0 mortality"
# history at the start of each country's series; dropping them is an option.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [265]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns, so the components are fit on the target itself — likely leakage.
# NOTE(review): PCA on unscaled data lets high-variance columns dominate, and
# fitting before the train/test split lets test rows influence the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[265]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [266]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep the first 6 components (ordered by explained variance). Each component
# is a linear mix of ALL columns passed to fit(), not one original feature.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [267]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the raw features;
# reusing the raw feature names here is misleading and propagates into the
# feature-importance table computed later.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [268]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df), so this step has no effect on the model input.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [269]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): principal_df rows align positionally with df_updated, so X and
# y stay paired. A random split of a time series whose PCA inputs included the
# target and its lags leaks future information into training — a chronological
# split would be safer and likely explains the near-perfect R^2 below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [270]:
# Fit scaling on the training set
# Mean/std statistics come from the training split only; the test set is
# transformed with these same statistics in a later cell.
scaler = StandardScaler()
scaler.fit(X_train)
Out[270]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [271]:
# Apply scaling on the training set (using the training-set mean/std fitted above)
X_train_scaled = scaler.transform(X_train)
In [272]:
# Apply scaling on the test set (same training-set statistics; no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [273]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3,240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [274]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): cv=10 uses unshuffled KFold here; on time-ordered rows this
# mixes past and future within folds — TimeSeriesSplit is the usual choice.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9972748488300354
In [275]:
# Fit the model using the best hyperparameters
# NOTE: GridSearchCV already refit best_estimator_ on the whole training set
# (refit=True is the default), so this fit() call retrains an identical model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [276]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and misbehaves if y_pred contains negatives.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06576710142677315
R2 Score: 0.9931527190173807
RMSE: 0.256451
Entropy Value: 0.003671594073428902
In [277]:
# NOTE(review): the 'feature' labels are PCA components that were renamed with
# raw-feature names when principal_df was built, so these importances describe
# components, not the original variables. Name reuse below (array -> DataFrame)
# is also worth avoiding.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[277]:
feature importance
1 diabetes_prevalence 0.682271
0 cardiovasc_death_rate 0.282642
5 median_age 0.012867
3 life_expectancy 0.010845
2 female_smokers 0.010472
4 aged_65_older 0.000904
In [278]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory / relative path for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[278]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [279]:
country1 = 'Estonia'
country2 = 'France'

# Restrict to the country-health-index features (plus identifiers and the
# target) for the two countries compared in this XGBoost analysis.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
in_scope = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_scope, feature_cols]
In [280]:
df_updated
Out[280]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6250 Estonia 1/18/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6251 Estonia 2/5/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6252 Estonia 2/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6253 Estonia 2/7/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411892

2132 rows × 9 columns

In [281]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() is positional — assumes rows are already date-sorted
# within each location; confirm the upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [282]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up rows fabricates a "0 mortality"
# history at the start of each country's series; dropping them is an option.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [283]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag
# columns, so the components are fit on the target itself — likely leakage.
# NOTE(review): PCA on unscaled data (including raw 'population') lets
# high-variance columns dominate; scale first and fit on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[283]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [284]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep the first 6 components (ordered by explained variance). Each component
# is a linear mix of ALL columns passed to fit(), not one original feature.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [285]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the raw features;
# reusing the raw feature names here is misleading and propagates into the
# feature-importance table computed later.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [286]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df), so this step has no effect on the model input.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [287]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): principal_df rows align positionally with df_updated, so X and
# y stay paired. A random split of a time series whose PCA inputs included the
# target and its lags leaks future information into training — a chronological
# split would be safer and likely explains the near-perfect R^2 below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [288]:
# Fit scaling on the training set
# Mean/std statistics come from the training split only; the test set is
# transformed with these same statistics in a later cell.
scaler = StandardScaler()
scaler.fit(X_train)
Out[288]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [289]:
# Apply scaling on the training set (using the training-set mean/std fitted above)
X_train_scaled = scaler.transform(X_train)
In [290]:
# Apply scaling on the test set (same training-set statistics; no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [291]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3,240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [292]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): cv=10 uses unshuffled KFold here; on time-ordered rows this
# mixes past and future within folds — TimeSeriesSplit is the usual choice.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9978195582971618
In [293]:
# Fit the model using the best hyperparameters
# NOTE: GridSearchCV already refit best_estimator_ on the whole training set
# (refit=True is the default), so this fit() call retrains an identical model.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [294]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and misbehaves if y_pred contains negatives.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.07504882607957856
R2 Score: 0.9921863608333906
RMSE: 0.273950
Entropy Value: 0.004313799679754843
In [295]:
# Tabulate and rank the tuned model's feature importances (highest first).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[295]:
feature importance
1 human_development_index 0.760584
5 population 0.117334
0 hospital_beds_per_thousand 0.096855
2 extreme_poverty 0.013718
3 gdp_per_capita 0.011203
4 population_density 0.000306
In [296]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable across machines;
# consider a configurable data directory. The bare trailing expression
# displays the frame so shape and columns can be sanity-checked.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[296]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [297]:
# Countries compared in this run of the pipeline.
country1 = 'Italy'
country2 = 'Portugal'

# Keep only the population-health predictors (plus identifiers and target)
# for the two selected countries, in a single .loc selection.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [298]:
# Inspect the filtered two-country subset before feature engineering.
df_updated
Out[298]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 83.51 23.021 47.9 0.735109

2098 rows × 9 columns

In [299]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive the 1-day, 7-day, and 30-day lagged mortality rate with one loop;
# shifting within each location group keeps countries independent of each other.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [300]:
# The first 1/7/30 rows of each country have no lagged value after shift();
# replace those NaNs with 0 so every row is usable by the model.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [301]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which includes 'Mortality Rate' and the three lagged-mortality columns —
# the prediction target is therefore part of the PCA inputs (target leakage).
# PCA is also fit on the full dataset before the train/test split, so test-set
# statistics influence the components, and it is applied to unstandardized
# columns even though PCA is scale-sensitive. Consider fitting PCA on
# training-set predictors only (e.g. in a Pipeline after StandardScaler).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[301]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [302]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the six retained components are linear combinations of ALL
# transformed columns; they do not correspond one-to-one to the six input
# variables whose names are attached to them in the next cell.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [303]:
# Wrap the component scores in a labelled frame and re-attach the country.
# (Column labels reuse the original variable names for downstream reporting.)
component_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(principal_components, columns=component_cols)
principal_df['location'] = df_updated['location'].to_numpy()
In [304]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* indicator columns are not used as
# model inputs below (X is built from principal_df), so this encoding has no
# effect on the fitted model — TODO confirm whether it was meant to be used.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [305]:
# Model inputs: the six PCA scores (labelled with original variable names);
# target: the unlagged mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for XGBoost Model
# (70/30 hold-out with a fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [306]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only,
# so no test-set statistics leak into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[306]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [307]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [308]:
# Apply scaling on the test set
# Reuses the training-derived parameters (correct practice — no refit on test).
X_test_scaled = scaler.transform(X_test)
In [309]:
# Define XGBoost model with default settings; hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search (same values, expressed
# with the dict() constructor for readability).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [310]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): GridSearchCV's default scoring for a regressor is R^2, and
# cv=10 uses an unshuffled KFold on time-ordered rows, so CV folds mix past
# and future observations — TODO confirm this is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9993040710802911
In [311]:
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, best_estimator_ is already refit
# on the full training set — this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [312]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and can be ill-defined when y_pred has zeros
# where y_test does not — interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.02093963561387409
R2 Score: 0.9981619530362098
RMSE: 0.144705
Entropy Value: 0.0012838103910311198
In [313]:
# Tabulate and rank the tuned model's feature importances (highest first).
# NOTE(review): the model was trained on PCA component scores, so each row
# actually ranks a principal component, not the original variable whose name
# labels it — TODO confirm the intended interpretation.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[313]:
feature importance
5 median_age 0.791038
0 cardiovasc_death_rate 0.174679
1 diabetes_prevalence 0.022490
2 female_smokers 0.011428
3 life_expectancy 0.000305
4 aged_65_older 0.000060
In [314]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable across machines;
# consider a configurable data directory. The bare trailing expression
# displays the frame so shape and columns can be sanity-checked.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[314]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [315]:
# Countries compared in this run of the pipeline.
country1 = 'Italy'
country2 = 'Portugal'

# Keep only the country-health predictors (plus identifiers and target)
# for the two selected countries, in a single .loc selection.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [316]:
# Inspect the filtered two-country subset before feature engineering.
df_updated
Out[316]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2098 rows × 9 columns

In [317]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive the 1-day, 7-day, and 30-day lagged mortality rate with one loop;
# shifting within each location group keeps countries independent of each other.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [318]:
# The first 1/7/30 rows of each country have no lagged value after shift();
# replace those NaNs with 0 so every row is usable by the model.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [319]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which includes 'Mortality Rate' and the three lagged-mortality columns —
# the prediction target is therefore part of the PCA inputs (target leakage).
# PCA is also fit on the full dataset before the train/test split, so test-set
# statistics influence the components, and it is applied to unstandardized
# columns even though PCA is scale-sensitive. Consider fitting PCA on
# training-set predictors only (e.g. in a Pipeline after StandardScaler).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[319]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [320]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the six retained components are linear combinations of ALL
# transformed columns; they do not correspond one-to-one to the six input
# variables whose names are attached to them in the next cell.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [321]:
# Wrap the component scores in a labelled frame and re-attach the country.
# (Column labels reuse the original variable names for downstream reporting.)
component_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(principal_components, columns=component_cols)
principal_df['location'] = df_updated['location'].to_numpy()
In [322]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* indicator columns are not used as
# model inputs below (X is built from principal_df), so this encoding has no
# effect on the fitted model — TODO confirm whether it was meant to be used.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [323]:
# Model inputs: the six PCA scores (labelled with original variable names);
# target: the unlagged mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for XGBoost Model
# (70/30 hold-out with a fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [324]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only,
# so no test-set statistics leak into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[324]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [325]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [326]:
# Apply scaling on the test set
# Reuses the training-derived parameters (correct practice — no refit on test).
X_test_scaled = scaler.transform(X_test)
In [327]:
# Define XGBoost model with default settings; hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search (same values, expressed
# with the dict() constructor for readability).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [328]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): GridSearchCV's default scoring for a regressor is R^2, and
# cv=10 uses an unshuffled KFold on time-ordered rows, so CV folds mix past
# and future observations — TODO confirm this is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992988336861244
In [329]:
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, best_estimator_ is already refit
# on the full training set — this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [330]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and can be ill-defined when y_pred has zeros
# where y_test does not — interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013594354001948517
R2 Score: 0.9988067098416261
RMSE: 0.116595
Entropy Value: 0.0007082932658725094
In [331]:
# Tabulate and rank the tuned model's feature importances (highest first).
# NOTE(review): the model was trained on PCA component scores, so each row
# actually ranks a principal component, not the original variable whose name
# labels it — TODO confirm the intended interpretation.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[331]:
feature importance
5 population 0.526697
1 human_development_index 0.421271
2 extreme_poverty 0.034415
0 hospital_beds_per_thousand 0.010278
3 gdp_per_capita 0.007127
4 population_density 0.000212
In [332]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable across machines;
# consider a configurable data directory. The bare trailing expression
# displays the frame so shape and columns can be sanity-checked.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[332]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [333]:
# Countries compared in this run of the pipeline.
country1 = 'Romania'
country2 = 'Serbia'

# Keep only the population-health predictors (plus identifiers and target)
# for the two selected countries, in a single .loc selection.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [334]:
# Inspect the filtered two-country subset before feature engineering.
df_updated
Out[334]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 76.05 17.850 43.0 2.036403

2076 rows × 9 columns

In [335]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive the 1-day, 7-day, and 30-day lagged mortality rate with one loop;
# shifting within each location group keeps countries independent of each other.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [336]:
# The first 1/7/30 rows of each country have no lagged value after shift();
# replace those NaNs with 0 so every row is usable by the model.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [337]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which includes 'Mortality Rate' and the three lagged-mortality columns —
# the prediction target is therefore part of the PCA inputs (target leakage).
# PCA is also fit on the full dataset before the train/test split, so test-set
# statistics influence the components, and it is applied to unstandardized
# columns even though PCA is scale-sensitive. Consider fitting PCA on
# training-set predictors only (e.g. in a Pipeline after StandardScaler).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[337]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [338]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the six retained components are linear combinations of ALL
# transformed columns; they do not correspond one-to-one to the six input
# variables whose names are attached to them in the next cell.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [339]:
# Wrap the component scores in a labelled frame and re-attach the country.
# (Column labels reuse the original variable names for downstream reporting.)
component_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(principal_components, columns=component_cols)
principal_df['location'] = df_updated['location'].to_numpy()
In [340]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* indicator columns are not used as
# model inputs below (X is built from principal_df), so this encoding has no
# effect on the fitted model — TODO confirm whether it was meant to be used.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [341]:
# Model inputs: the six PCA scores (labelled with original variable names);
# target: the unlagged mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for XGBoost Model
# (70/30 hold-out with a fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [342]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only,
# so no test-set statistics leak into the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
Out[342]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [343]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [344]:
# Apply scaling on the test set
# Reuses the training-derived parameters (correct practice — no refit on test).
X_test_scaled = scaler.transform(X_test)
In [345]:
# Define XGBoost model with default settings; hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search (same values, expressed
# with the dict() constructor for readability).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [346]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): GridSearchCV's default scoring for a regressor is R^2, and
# cv=10 uses an unshuffled KFold on time-ordered rows, so CV folds mix past
# and future observations — TODO confirm this is intended.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985721206840632
In [347]:
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, best_estimator_ is already refit
# on the full training set — this explicit fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [348]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence; it is not a
# standard regression metric and can be ill-defined when y_pred has zeros
# where y_test does not — interpret this value with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001614404165662264
R2 Score: 0.9990600803111381
RMSE: 0.040180
Entropy Value: 0.00041066443993060793
In [349]:
# Tabulate and rank the tuned model's feature importances (highest first).
# NOTE(review): the model was trained on PCA component scores, so each row
# actually ranks a principal component, not the original variable whose name
# labels it — TODO confirm the intended interpretation.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[349]:
feature importance
0 cardiovasc_death_rate 0.450615
5 median_age 0.372406
1 diabetes_prevalence 0.154296
2 female_smokers 0.013306
3 life_expectancy 0.008851
4 aged_65_older 0.000526
In [350]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable across machines;
# consider a configurable data directory. The bare trailing expression
# displays the frame so shape and columns can be sanity-checked.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[350]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [351]:
# Countries compared in this run of the pipeline.
country1 = 'Romania'
country2 = 'Serbia'

# Keep only the country-health predictors (plus identifiers and target)
# for the two selected countries, in a single .loc selection.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [352]:
# Inspect the filtered two-country subset before feature engineering.
df_updated
Out[352]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 9 columns

In [353]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive the 1-day, 7-day, and 30-day lagged mortality rate with one loop;
# shifting within each location group keeps countries independent of each other.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [354]:
# The first 1/7/30 rows of each country have no lagged value after shift();
# replace those NaNs with 0 so every row is usable by the model.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [355]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[355]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [356]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all fitted components, then keep only the first 6 (highest variance).
# NOTE(review): this projection inherits the target leakage flagged at pca.fit(...)
# above, since the same target-bearing column block is transformed here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [357]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a linear mix of ALL inputs), not the original variable it is named
# after. Downstream "feature importances" therefore describe components, not the
# named variables. Consider renaming to PC1..PC6 and adjusting later selections.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Row order is unchanged by the PCA projection, so 'location' can be re-attached positionally.
principal_df['location'] = df_updated['location'].values
In [358]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never fed to the model
# (X is built from principal_df below), so the practical effect of this step is
# only to remove the raw 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [359]:
# Feature matrix: the six retained principal components (still carrying the
# original variables' names — see note at principal_df); target: raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [360]:
# Standardize features (zero mean, unit variance), fitting on the training split
# only so that no test-set statistics leak into the scaler.
scaler = StandardScaler().fit(X_train)
Out[360]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [361]:
# Apply scaling on the training set
# (uses the mean/variance estimated from the training split above)
X_train_scaled = scaler.transform(X_train)
In [362]:
# Apply scaling on the test set
# (reuses the training-split statistics — the scaler is never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [363]:
# Define XGBoost model; pin the seed explicitly so the stochastic subsample /
# colsample draws (both < 1.0 in the grid below) are reproducible across runs.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune: 3*3*3*3*2*2 = 324 candidate configurations.
params = {'max_depth': [3, 4, 5],              # maximum tree depth
          'learning_rate': [0.1, 0.01, 0.001], # shrinkage per boosting round
          'n_estimators': [50, 100, 150],      # number of boosting rounds
          'gamma': [0, 0.1, 0.2],              # minimum loss reduction to split
          'subsample': [0.8, 0.9],             # row fraction sampled per tree
          'colsample_bytree': [0.8, 0.9]}      # column fraction sampled per tree
In [364]:
# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available cores; scoring defaults to R^2 for regressors.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979474526774726
In [365]:
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set, so best_estimator_ is ready to use;
# the previous explicit fit() here retrained an identical model for nothing.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [366]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to probability
# distributions and returns their KL divergence; applied to raw regression
# targets/predictions it is not a standard error metric, and it returns inf if
# any prediction is 0 where the true value is positive. Consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002076185405706244
R2 Score: 0.9987912273877525
RMSE: 0.045565
Entropy Value: 0.00045853981129780964
In [367]:
# Gain-based importances from the tuned model, sorted descending.
feature_importances = best_model.feature_importances_
# NOTE(review): rows are labelled with original variable names, but the model was
# trained on principal components — each "importance" belongs to a component, not
# to the named variable. Interpret with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[367]:
feature importance
5 population 0.663901
0 hospital_beds_per_thousand 0.186232
1 human_development_index 0.124847
2 extreme_poverty 0.013139
3 gdp_per_capita 0.011260
4 population_density 0.000621
In [368]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable across machines;
# consider a DATA_DIR constant in a top-of-notebook config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[368]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [369]:
country1 = 'Slovakia'
country2 = 'Spain'

# Restrict rows to the two countries under comparison and keep only the
# population-health predictors plus identifiers and the target (one combined
# .loc selection; row order and values are unchanged).
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate'],
]
In [370]:
# Preview the filtered two-country frame (rendered as the cell's last expression).
df_updated
Out[370]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 83.56 19.436 45.5 0.855148

2092 rows × 9 columns

In [371]:
# Convert the per-country time series into a supervised-learning table by adding
# lagged copies of the target, so each row carries its own recent mortality
# history and XGBoost can be applied directly to tabular rows.
# shift() is computed within each location group so lags never cross country
# boundaries. NOTE(review): assumes rows are date-sorted within each location —
# confirm upstream ordering.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [372]:
# The earliest rows of each country have no history for a given lag; treat that
# missing history as a zero mortality rate instead of dropping the rows.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [373]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the three
# lagged mortality columns, so the prediction target leaks into the PCA inputs that
# later become the model features — this inflates the reported scores. TODO: fit PCA
# on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied only
# after the projection); standardizing before PCA is conventional so that
# large-magnitude columns do not dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[373]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [374]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all fitted components, then keep only the first 6 (highest variance).
# NOTE(review): this projection inherits the target leakage flagged at pca.fit(...)
# above, since the same target-bearing column block is transformed here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [375]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a linear mix of ALL inputs), not the original variable it is named
# after. Downstream "feature importances" therefore describe components, not the
# named variables. Consider renaming to PC1..PC6 and adjusting later selections.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Row order is unchanged by the PCA projection, so 'location' can be re-attached positionally.
principal_df['location'] = df_updated['location'].values
In [376]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never fed to the model
# (X is built from principal_df below), so the practical effect of this step is
# only to remove the raw 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [377]:
# Feature matrix: the six retained principal components (still carrying the
# original variables' names — see note at principal_df); target: raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [378]:
# Standardize features (zero mean, unit variance), fitting on the training split
# only so that no test-set statistics leak into the scaler.
scaler = StandardScaler().fit(X_train)
Out[378]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [379]:
# Apply scaling on the training set
# (uses the mean/variance estimated from the training split above)
X_train_scaled = scaler.transform(X_train)
In [380]:
# Apply scaling on the test set
# (reuses the training-split statistics — the scaler is never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [381]:
# Define XGBoost model; pin the seed explicitly so the stochastic subsample /
# colsample draws (both < 1.0 in the grid below) are reproducible across runs.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune: 3*3*3*3*2*2 = 324 candidate configurations.
params = {'max_depth': [3, 4, 5],              # maximum tree depth
          'learning_rate': [0.1, 0.01, 0.001], # shrinkage per boosting round
          'n_estimators': [50, 100, 150],      # number of boosting rounds
          'gamma': [0, 0.1, 0.2],              # minimum loss reduction to split
          'subsample': [0.8, 0.9],             # row fraction sampled per tree
          'colsample_bytree': [0.8, 0.9]}      # column fraction sampled per tree
In [382]:
# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available cores; scoring defaults to R^2 for regressors.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9994767987719648
In [383]:
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set, so best_estimator_ is ready to use;
# the previous explicit fit() here retrained an identical model for nothing.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [384]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to probability
# distributions and returns their KL divergence; applied to raw regression
# targets/predictions it is not a standard error metric, and it returns inf if
# any prediction is 0 where the true value is positive. Consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015014430402105537
R2 Score: 0.9997248669986107
RMSE: 0.038748
Entropy Value: 0.00014000658074544682
In [385]:
# Gain-based importances from the tuned model, sorted descending.
feature_importances = best_model.feature_importances_
# NOTE(review): rows are labelled with original variable names, but the model was
# trained on principal components — each "importance" belongs to a component, not
# to the named variable. Interpret with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[385]:
feature importance
5 median_age 0.886071
1 diabetes_prevalence 0.062596
0 cardiovasc_death_rate 0.043915
2 female_smokers 0.006730
3 life_expectancy 0.000657
4 aged_65_older 0.000031
In [386]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable across machines;
# consider a DATA_DIR constant in a top-of-notebook config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[386]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [387]:
country1 = 'Slovakia'
country2 = 'Spain'

# Restrict rows to the two countries under comparison and keep only the
# country-health predictors plus identifiers and the target (one combined
# .loc selection; row order and values are unchanged).
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate'],
]
In [388]:
# Preview the filtered two-country frame (rendered as the cell's last expression).
df_updated
Out[388]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148

2092 rows × 9 columns

In [389]:
# Convert the per-country time series into a supervised-learning table by adding
# lagged copies of the target, so each row carries its own recent mortality
# history and XGBoost can be applied directly to tabular rows.
# shift() is computed within each location group so lags never cross country
# boundaries. NOTE(review): assumes rows are date-sorted within each location —
# confirm upstream ordering.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [390]:
# The earliest rows of each country have no history for a given lag; treat that
# missing history as a zero mortality rate instead of dropping the rows.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [391]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the three
# lagged mortality columns, so the prediction target leaks into the PCA inputs that
# later become the model features — this inflates the reported scores. TODO: fit PCA
# on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied only
# after the projection); standardizing before PCA is conventional so that
# large-magnitude columns (e.g. population) do not dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[391]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [392]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all fitted components, then keep only the first 6 (highest variance).
# NOTE(review): this projection inherits the target leakage flagged at pca.fit(...)
# above, since the same target-bearing column block is transformed here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [393]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a linear mix of ALL inputs), not the original variable it is named
# after. Downstream "feature importances" therefore describe components, not the
# named variables. Consider renaming to PC1..PC6 and adjusting later selections.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Row order is unchanged by the PCA projection, so 'location' can be re-attached positionally.
principal_df['location'] = df_updated['location'].values
In [394]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never fed to the model
# (X is built from principal_df below), so the practical effect of this step is
# only to remove the raw 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [395]:
# Feature matrix: the six retained principal components (still carrying the
# original variables' names — see note at principal_df); target: raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [396]:
# Standardize features (zero mean, unit variance), fitting on the training split
# only so that no test-set statistics leak into the scaler.
scaler = StandardScaler().fit(X_train)
Out[396]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [397]:
# Apply scaling on the training set
# (uses the mean/variance estimated from the training split above)
X_train_scaled = scaler.transform(X_train)
In [398]:
# Apply scaling on the test set
# (reuses the training-split statistics — the scaler is never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [399]:
# Define XGBoost model; pin the seed explicitly so the stochastic subsample /
# colsample draws (both < 1.0 in the grid below) are reproducible across runs.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune: 3*3*3*3*2*2 = 324 candidate configurations.
params = {'max_depth': [3, 4, 5],              # maximum tree depth
          'learning_rate': [0.1, 0.01, 0.001], # shrinkage per boosting round
          'n_estimators': [50, 100, 150],      # number of boosting rounds
          'gamma': [0, 0.1, 0.2],              # minimum loss reduction to split
          'subsample': [0.8, 0.9],             # row fraction sampled per tree
          'colsample_bytree': [0.8, 0.9]}      # column fraction sampled per tree
In [400]:
# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available cores; scoring defaults to R^2 for regressors.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988021206356716
In [401]:
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set, so best_estimator_ is ready to use;
# the previous explicit fit() here retrained an identical model for nothing.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [402]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to probability
# distributions and returns their KL divergence; applied to raw regression
# targets/predictions it is not a standard error metric, and it returns inf if
# any prediction is 0 where the true value is positive. Consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006694095157429868
R2 Score: 0.99877333575572
RMSE: 0.081817
Entropy Value: 0.00048549470785567154
In [403]:
# Gain-based importances from the tuned model, sorted descending.
feature_importances = best_model.feature_importances_
# NOTE(review): rows are labelled with original variable names, but the model was
# trained on principal components — each "importance" belongs to a component, not
# to the named variable. Interpret with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[403]:
feature importance
1 human_development_index 0.766794
5 population 0.138965
2 extreme_poverty 0.046793
0 hospital_beds_per_thousand 0.046793
3 gdp_per_capita 0.000507
4 population_density 0.000149
In [404]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable across machines;
# consider a DATA_DIR constant in a top-of-notebook config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[404]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [405]:
country1 = 'Switzerland'
country2 = 'Bulgaria'

# Restrict rows to the two countries under comparison and keep only the
# population-health predictors plus identifiers and the target (one combined
# .loc selection; row order and values are unchanged).
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate'],
]
In [406]:
# Preview the filtered two-country frame (rendered as the cell's last expression).
df_updated
Out[406]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 83.78 18.436 43.1 0.322149

2066 rows × 9 columns

In [407]:
# Convert the per-country time series into a supervised-learning table by adding
# lagged copies of the target, so each row carries its own recent mortality
# history and XGBoost can be applied directly to tabular rows.
# shift() is computed within each location group so lags never cross country
# boundaries. NOTE(review): assumes rows are date-sorted within each location —
# confirm upstream ordering.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [408]:
# The earliest rows of each country have no history for a given lag; treat that
# missing history as a zero mortality rate instead of dropping the rows.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [409]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the three
# lagged mortality columns, so the prediction target leaks into the PCA inputs that
# later become the model features — this inflates the reported scores. TODO: fit PCA
# on the predictor columns only.
# NOTE(review): PCA is fit on unscaled data here (StandardScaler is applied only
# after the projection); standardizing before PCA is conventional so that
# large-magnitude columns do not dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[409]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [410]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all fitted components, then keep only the first 6 (highest variance).
# NOTE(review): this projection inherits the target leakage flagged at pca.fit(...)
# above, since the same target-bearing column block is transformed here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [411]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a linear mix of ALL inputs), not the original variable it is named
# after. Downstream "feature importances" therefore describe components, not the
# named variables. Consider renaming to PC1..PC6 and adjusting later selections.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Row order is unchanged by the PCA projection, so 'location' can be re-attached positionally.
principal_df['location'] = df_updated['location'].values
In [412]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never fed to the model
# (X is built from principal_df below), so the practical effect of this step is
# only to remove the raw 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [413]:
# Feature matrix: the six retained principal components (still carrying the
# original variables' names — see note at principal_df); target: raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [414]:
# Standardize features (zero mean, unit variance), fitting on the training split
# only so that no test-set statistics leak into the scaler.
scaler = StandardScaler().fit(X_train)
Out[414]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [415]:
# Apply scaling on the training set
# (uses the mean/variance estimated from the training split above)
X_train_scaled = scaler.transform(X_train)
In [416]:
# Apply scaling on the test set
# (reuses the training-split statistics — the scaler is never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [417]:
# Define XGBoost model; pin the seed explicitly so the stochastic subsample /
# colsample draws (both < 1.0 in the grid below) are reproducible across runs.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune: 3*3*3*3*2*2 = 324 candidate configurations.
params = {'max_depth': [3, 4, 5],              # maximum tree depth
          'learning_rate': [0.1, 0.01, 0.001], # shrinkage per boosting round
          'n_estimators': [50, 100, 150],      # number of boosting rounds
          'gamma': [0, 0.1, 0.2],              # minimum loss reduction to split
          'subsample': [0.8, 0.9],             # row fraction sampled per tree
          'colsample_bytree': [0.8, 0.9]}      # column fraction sampled per tree
In [418]:
# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available cores; scoring defaults to R^2 for regressors.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9661827427503995
In [419]:
# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set, so best_estimator_ is ready to use;
# the previous explicit fit() here retrained an identical model for nothing.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [420]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs to probability
# distributions and returns their KL divergence; applied to raw regression
# targets/predictions it is not a standard error metric, and it returns inf if
# any prediction is 0 where the true value is positive. Consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00378049910950887
R2 Score: 0.9984788009596705
RMSE: 0.061486
Entropy Value: 0.00038857518870859817
In [421]:
# Gain-based importances from the tuned model, sorted descending.
feature_importances = best_model.feature_importances_
# NOTE(review): rows are labelled with original variable names, but the model was
# trained on principal components — each "importance" belongs to a component, not
# to the named variable. Interpret with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[421]:
feature importance
5 median_age 0.327840
0 cardiovasc_death_rate 0.307169
1 diabetes_prevalence 0.301346
2 female_smokers 0.035199
4 aged_65_older 0.017073
3 life_expectancy 0.011373
In [422]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — not portable across machines;
# consider a DATA_DIR constant in a top-of-notebook config cell.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[422]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [423]:
country1 = 'Switzerland'
country2 = 'Bulgaria'

# Restrict rows to the two countries under comparison and keep only the
# country-health predictors plus identifiers and the target (one combined
# .loc selection; row order and values are unchanged).
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate'],
]
In [424]:
# Preview the filtered two-country subset
df_updated
Out[424]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.50 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.50 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.322922
14645 Switzerland 12/26/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.322922
14646 Switzerland 12/27/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.322922
14647 Switzerland 12/28/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.323082
14648 Switzerland 12/29/2022 4.530 0.955 0.03 57410.166 214.243 8740471 0.322149

2066 rows × 9 columns

In [425]:
'''
Lagged mortality-rate variables (previous day / week / month) convert the
Our World in Data COVID-19 time series into a supervised-learning table:
each row becomes one observation carrying its own recent mortality history
as features, which is the tabular format XGBoost requires.
'''
# Create each lag within a country group so values never shift across the
# boundary between two countries' series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [426]:
# The first 1/7/30 rows of each country's series have no earlier observation
# to shift in; treat that pre-data period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [427]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lag
# columns, so the prediction target feeds into the components later used as
# model inputs (target leakage — likely inflating the reported R^2). Confirm
# and restrict PCA to the explanatory columns only.
# NOTE(review): PCA runs on unscaled data here; large-magnitude features
# (e.g. population) will dominate the components — consider scaling first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[427]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [428]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA input (iloc[:, 2:]) has 10 columns (6 features +
# 'Mortality Rate' + 3 lags), so this keeps the 6 highest-variance components,
# not "one per input variable" as the comment suggests.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [429]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# PCA inputs), not the original features — labelling them with raw feature
# names makes the later "feature importance" table read as if it ranked the
# original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [430]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used by the model input X (built
# later from principal_df); the effective result here is only the removal of
# the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [431]:
# Columns of principal_df holding the first six principal components.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a daily time series; a
# chronological (train-on-past, test-on-future) split would be the safer
# evaluation for forecasting-style conclusions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [432]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only, so
# test-set statistics do not leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[432]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [433]:
# Apply scaling on the training set
# (uses the mean/std learned from the training split above)
X_train_scaled = scaler.transform(X_train)
In [434]:
# Apply scaling on the test set
# (same training-split statistics applied to unseen data — no refit)
X_test_scaled = scaler.transform(X_test)
In [435]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search below
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [436]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9660124510295903
In [437]:
# GridSearchCV (refit=True by default) has already refit the best estimator on
# the full training set, so an explicit best_model.fit(...) here would be
# redundant work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [438]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and computes the KL divergence D(y_test || y_pred);
# mortality rates are not distributions, and zero entries make this fragile —
# confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0036559826087191843
R2 Score: 0.9985289039688287
RMSE: 0.060465
Entropy Value: 0.00028880483478410657
In [439]:
feature_importances = best_model.feature_importances_
# NOTE(review): these importances belong to the 6 principal components, which
# were only *labelled* with original feature names — they do not directly rank
# the raw features. The name `feature_importances` is also reused for both the
# raw array and the DataFrame below.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[439]:
feature importance
0 hospital_beds_per_thousand 0.513202
1 human_development_index 0.253525
5 population 0.179491
2 extreme_poverty 0.034966
3 gdp_per_capita 0.010116
4 population_density 0.008700
In [440]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a Path relative to a
# configurable data directory for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[440]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [441]:
country1 = 'Cyprus'
country2 = 'Latvia'

# Keep only the population-health-index features (plus identifiers and the
# target), restricted to the two countries under comparison.
feature_cols = ['location', 'date', 'cardiovasc_death_rate',
                'diabetes_prevalence', 'female_smokers', 'life_expectancy',
                'aged_65_older', 'median_age', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, feature_cols]
In [442]:
# Preview the filtered two-country subset
df_updated
Out[442]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 75.29 19.754 43.9 0.631969

2065 rows × 9 columns

In [443]:
'''
Lagged mortality-rate variables (previous day / week / month) convert the
Our World in Data COVID-19 time series into a supervised-learning table:
each row becomes one observation carrying its own recent mortality history
as features, which is the tabular format XGBoost requires.
'''
# Create each lag within a country group so values never shift across the
# boundary between two countries' series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [444]:
# The first 1/7/30 rows of each country's series have no earlier observation
# to shift in; treat that pre-data period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [445]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lag
# columns, so the prediction target feeds into the components later used as
# model inputs (target leakage — likely inflating the reported R^2).
# NOTE(review): PCA runs on unscaled data; large-magnitude features will
# dominate the components — consider scaling first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[445]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [446]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA input (iloc[:, 2:]) has 10 columns (6 features +
# 'Mortality Rate' + 3 lags), so this keeps the 6 highest-variance components,
# not "one per input variable" as the comment suggests.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [447]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# PCA inputs), not the original features — labelling them with raw feature
# names makes the later "feature importance" table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [448]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used by the model input X (built
# later from principal_df); the effective result here is only the removal of
# the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [449]:
# Columns of principal_df holding the first six principal components.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a daily time series; a
# chronological split would be the safer evaluation for forecasting claims.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [450]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only, so
# test-set statistics do not leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[450]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [451]:
# Apply scaling on the training set
# (uses the mean/std learned from the training split above)
X_train_scaled = scaler.transform(X_train)
In [452]:
# Apply scaling on the test set
# (same training-split statistics applied to unseen data — no refit)
X_test_scaled = scaler.transform(X_test)
In [453]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search below
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [454]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968381533668923
In [455]:
# GridSearchCV (refit=True by default) has already refit the best estimator on
# the full training set, so an explicit best_model.fit(...) here would be
# redundant work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [456]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and computes KL divergence — mortality rates are
# not distributions; confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0007853700061385886
R2 Score: 0.9984435932019353
RMSE: 0.028024
Entropy Value: 0.00041728333907532875
In [457]:
feature_importances = best_model.feature_importances_
# NOTE(review): these importances belong to the 6 principal components, which
# were only *labelled* with original feature names — they do not directly rank
# the raw features. `feature_importances` is also reused for both the raw
# array and the DataFrame below.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[457]:
feature importance
1 diabetes_prevalence 0.495285
5 median_age 0.278346
0 cardiovasc_death_rate 0.187393
2 female_smokers 0.032736
3 life_expectancy 0.004431
4 aged_65_older 0.001808
In [458]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a Path relative to a
# configurable data directory for portability.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[458]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [459]:
country1 = 'Cyprus'
country2 = 'Latvia'

# Keep only the country-health-index features (plus identifiers and the
# target), restricted to the two countries under comparison.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand',
                'human_development_index', 'extreme_poverty', 'gdp_per_capita',
                'population_density', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, feature_cols]
In [460]:
# Preview the filtered two-country subset
df_updated
Out[460]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631969

2065 rows × 9 columns

In [461]:
'''
Lagged mortality-rate variables (previous day / week / month) convert the
Our World in Data COVID-19 time series into a supervised-learning table:
each row becomes one observation carrying its own recent mortality history
as features, which is the tabular format XGBoost requires.
'''
# Create each lag within a country group so values never shift across the
# boundary between two countries' series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [462]:
# The first 1/7/30 rows of each country's series have no earlier observation
# to shift in; treat that pre-data period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [463]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the three lag
# columns, so the prediction target feeds into the components later used as
# model inputs (target leakage — likely inflating the reported R^2).
# NOTE(review): PCA runs on unscaled data; large-magnitude features (e.g.
# population) will dominate the components — consider scaling first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[463]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [464]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA input (iloc[:, 2:]) has 10 columns (6 features +
# 'Mortality Rate' + 3 lags), so this keeps the 6 highest-variance components,
# not "one per input variable" as the comment suggests.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [465]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# PCA inputs), not the original features — labelling them with raw feature
# names makes the later "feature importance" table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [466]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used by the model input X (built
# later from principal_df); the effective result here is only the removal of
# the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [467]:
# Columns of principal_df holding the first six principal components.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a daily time series; a
# chronological split would be the safer evaluation for forecasting claims.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [468]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only, so
# test-set statistics do not leak into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[468]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [469]:
# Apply scaling on the training set
# (uses the mean/std learned from the training split above)
X_train_scaled = scaler.transform(X_train)
In [470]:
# Apply scaling on the test set
# (same training-split statistics applied to unseen data — no refit)
X_test_scaled = scaler.transform(X_test)
In [471]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search below
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [472]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9965848123547593
In [473]:
# GridSearchCV (refit=True by default) has already refit the best estimator on
# the full training set, so an explicit best_model.fit(...) here would be
# redundant work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [474]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as (unnormalised)
# probability distributions and computes KL divergence — mortality rates are
# not distributions; confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0010839291039479871
R2 Score: 0.9978519237902916
RMSE: 0.032923
Entropy Value: 0.000511041568557603
In [475]:
feature_importances = best_model.feature_importances_
# NOTE(review): these importances belong to the 6 principal components, which
# were only *labelled* with original feature names — they do not directly rank
# the raw features. `feature_importances` is also reused for both the raw
# array and the DataFrame below.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[475]:
feature importance
1 human_development_index 0.394633
0 hospital_beds_per_thousand 0.378747
5 population 0.194840
2 extreme_poverty 0.024974
3 gdp_per_capita 0.005458
4 population_density 0.001347
In [430]:
# Country Pair by Pair Analysis relative to female smokers
In [431]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hardcoded absolute local path — prefer a configurable data dir.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[431]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [432]:
# Showing the pairings of countries based on female smokers (13 pairs of countries)
def _country(name):
    """Return the rows of `df` belonging to a single country."""
    return df[df.location == name]

df_Canada = _country("Canada")
df_Cyprus = _country("Cyprus")

df_Denmark = _country("Denmark")
df_Finland = _country("Finland")

df_Iceland = _country("Iceland")
df_Italy = _country("Italy")

df_Portugal = _country("Portugal")
df_Slovenia = _country("Slovenia")

df_Sweden = _country("Sweden")
df_UnitedKingdom = _country("United Kingdom")

df_UnitedStates = _country("United States")
df_Austria = _country("Austria")

df_Belgium = _country("Belgium")
df_Estonia = _country("Estonia")

df_Ireland = _country("Ireland")
df_Latvia = _country("Latvia")

df_Luxembourg = _country("Luxembourg")
df_Netherlands = _country("Netherlands")

df_Romania = _country("Romania")
df_Slovakia = _country("Slovakia")

df_Spain = _country("Spain")
df_Switzerland = _country("Switzerland")

df_Bulgaria = _country("Bulgaria")
df_Czechia = _country("Czechia")

df_France = _country("France")
df_Serbia = _country("Serbia")
In [433]:
# Drop the first two rows of the UK frame.
# NOTE(review): presumably this aligns the UK series' start with its pair —
# confirm why exactly two rows are removed.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [434]:
# Concatenate the individual country dataframes into a single dataframe.
# NOTE(review): despite the original wording ("first country from each pair"),
# the list below contains all 26 countries, i.e. both members of every pair.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# consider index=False.
dataframe_one.to_csv("dataframe-one.csv")
In [435]:
# Importing the combined 26-country dataframe produced in the previous step.
# NOTE(review): the previous cell wrote "dataframe-one.csv" to the working
# directory, but this reads from Downloads — confirm both paths refer to the
# same (current) file.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[435]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [436]:
country1 = 'Canada'
country2 = 'Cyprus'

# Keep only the population-health-index features (plus identifiers and the
# target), restricted to the two countries under comparison.
feature_cols = ['location', 'date', 'cardiovasc_death_rate',
                'diabetes_prevalence', 'male_smokers', 'life_expectancy',
                'aged_65_older', 'median_age', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, feature_cols]
In [437]:
# Preview the filtered two-country subset
df_updated
Out[437]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 16.6 82.43 16.984 41.4 1.093162

2099 rows × 9 columns

In [438]:
'''
Lagged mortality-rate variables (previous day / week / month) convert the
Our World in Data COVID-19 time series into a supervised-learning table:
each row becomes one observation carrying its own recent mortality history
as features, which is the tabular format XGBoost requires.
'''
# Create each lag within a country group so values never shift across the
# boundary between two countries' series.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [439]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [440]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[440]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [441]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [442]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [443]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [444]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [445]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[445]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [446]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [447]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [448]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [449]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989396252151874
In [450]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [451]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0023675274789805804
R2 Score: 0.9993039593180916
RMSE: 0.048657
Entropy Value: 0.0002917411838061955
In [452]:
# Rank the six inputs by the tuned model's importance scores (descending).
# NOTE(review): X holds principal components, so these importances describe
# components that were merely labeled with the original feature names.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[452]:
feature importance
0 cardiovasc_death_rate 0.459504
1 diabetes_prevalence 0.417500
5 median_age 0.090781
2 male_smokers 0.026672
3 life_expectancy 0.004896
4 aged_65_older 0.000647
In [453]:
# Importing the dataframe of all 26 countries
# Re-import the full 26-country dataframe (the previous section filtered
# df_updated down to two countries in place).
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[453]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [454]:
country1 = 'Canada'
country2 = 'Cyprus'

# Extracting important features for XGBoost Model Analysis for the country health index
# (socio-economic / health-system columns this time, vs. the population-health
# columns used in the previous section)
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [455]:
df_updated
Out[455]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.093162

2099 rows × 9 columns

In [456]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (per-location shift keeps each country's series separate)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [457]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up lags is questionable in general;
# dropping those rows would be safer — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [458]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns here,
# so the target leaks into the components; PCA is also fit on unscaled data
# before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[458]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [459]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [460]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): components are labeled with original feature names, but each
# PC mixes all inputs — downstream importances rank components, not variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [461]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [462]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows inflates the scores;
# a chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [463]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[463]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [464]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [465]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [466]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [467]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989605271130995
In [468]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV(refit=True) already refit best_estimator_.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [469]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence, not a
# regression metric, and can return inf when the inputs contain zeros.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002685282233557505
R2 Score: 0.999210541084082
RMSE: 0.051820
Entropy Value: 0.00034527877599041973
In [470]:
# Rank the six inputs by the tuned model's importance scores (descending).
# NOTE(review): X holds principal components labeled with feature names, so
# these importances describe components, not the named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[470]:
feature importance
5 population 0.655426
1 human_development_index 0.198181
0 hospital_beds_per_thousand 0.115957
2 extreme_poverty 0.020517
4 population_density 0.006514
3 gdp_per_capita 0.003405
In [471]:
# Importing the dataframe of all 26 countries
# (fresh copy: the previous section filtered/one-hot-encoded df_updated in place)
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[471]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [472]:
country1 = 'Denmark'
country2 = 'Finland'

# Extracting important features for XGBoost Model Analysis for the population health index
# (same population-health pipeline as the Canada/Cyprus section, new country pair)
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [473]:
df_updated
Out[473]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
5188 Denmark 2/3/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
5189 Denmark 2/4/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
5190 Denmark 2/5/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
5191 Denmark 2/6/2020 114.767 6.41 18.8 80.90 19.677 42.3 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159
8372 Finland 12/26/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159
8373 Finland 12/27/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159
8374 Finland 12/28/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159
8375 Finland 12/29/2022 153.507 5.76 22.6 81.91 21.228 42.8 0.55159

2128 rows × 9 columns

In [474]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (per-location shift keeps each country's series separate)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [475]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up lags is questionable in general;
# dropping those rows would be safer — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [476]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns here,
# so the target leaks into the components; PCA is also fit on unscaled data
# before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[476]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [477]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [478]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): components are labeled with original feature names, but each
# PC mixes all inputs — downstream importances rank components, not variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [479]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [480]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows inflates the scores;
# a chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [481]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[481]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [482]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [483]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [484]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [485]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987535557486945
In [486]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV(refit=True) already refit best_estimator_.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [487]:
# Evaluate the performance of the XGBoost model: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 score, and entropy (a KL divergence).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(pk, qk) computes the KL divergence of the two
# (internally normalized) vectors and returns inf whenever qk has a zero where
# pk is non-zero — this cell previously printed "Entropy Value: inf" because
# the raw mortality rates contain exact zeros (and model predictions are not
# guaranteed non-negative). Clip to non-negative and smooth with a tiny
# epsilon so the divergence is finite and well-defined.
eps = 1e-10
y_test_dist = np.clip(np.asarray(y_test, dtype=float), 0, None) + eps
y_pred_dist = np.clip(np.asarray(y_pred, dtype=float), 0, None) + eps
entropy_val = entropy(y_test_dist, y_pred_dist)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007569049722669286
R2 Score: 0.9957062272826365
RMSE: 0.087000
Entropy Value: inf
In [488]:
# Rank the six inputs by the tuned model's importance scores (descending).
# NOTE(review): X holds principal components labeled with feature names, so
# these importances describe components, not the named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[488]:
feature importance
1 diabetes_prevalence 0.807879
5 median_age 0.060286
2 male_smokers 0.056874
0 cardiovasc_death_rate 0.053250
3 life_expectancy 0.021230
4 aged_65_older 0.000481
In [489]:
# Importing the dataframe of all 26 countries
# (fresh copy: the previous section filtered/one-hot-encoded df_updated in place)
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[489]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [490]:
country1 = 'Denmark'
country2 = 'Finland'

# Extracting important features for XGBoost Model Analysis for the country health index
# (socio-economic / health-system columns, same pipeline as earlier sections)
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [491]:
df_updated
Out[491]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5188 Denmark 2/3/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5189 Denmark 2/4/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5190 Denmark 2/5/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5191 Denmark 2/6/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2128 rows × 9 columns

In [492]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (per-location shift keeps each country's series separate)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [493]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the warm-up lags is questionable in general;
# dropping those rows would be safer — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [494]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns here,
# so the target leaks into the components; PCA is also fit on unscaled data
# before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[494]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [495]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [496]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): components are labeled with original feature names, but each
# PC mixes all inputs — downstream importances rank components, not variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [497]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [498]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows inflates the scores;
# a chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [499]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[499]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [500]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [501]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [502]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [503]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984953654596765
In [504]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV(refit=True) already refit best_estimator_.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [505]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence, not a
# regression metric, and can return inf when the inputs contain zeros.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008136863131264145
R2 Score: 0.9953841179278682
RMSE: 0.090205
Entropy Value: 0.0015398143907589235
In [506]:
# Rank the six inputs by the tuned model's importance scores (descending).
# NOTE(review): X holds principal components labeled with feature names, so
# these importances describe components, not the named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[506]:
feature importance
1 human_development_index 0.753936
5 population 0.092143
0 hospital_beds_per_thousand 0.070220
2 extreme_poverty 0.061815
3 gdp_per_capita 0.020882
4 population_density 0.001005
In [507]:
# Importing the dataframe of all 26 countries
# (fresh copy: the previous section filtered/one-hot-encoded df_updated in place)
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[507]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [508]:
# Countries compared in this run of the population-health pipeline.
country1 = 'Iceland'
country2 = 'Italy'

# Restrict to the two comparison countries and keep only the identifier columns
# plus the population-health predictors used by the XGBoost model.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [509]:
df_updated
Out[509]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
20911 Iceland 2/28/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
20912 Iceland 2/29/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
20913 Iceland 3/1/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
20914 Iceland 3/2/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
20915 Iceland 3/3/2020 117.992 5.31 15.2 82.99 14.431 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 27.8 83.51 23.021 47.9 0.735109

2100 rows × 9 columns

In [510]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive per-country lagged mortality features (1 day, 7 days, 30 days) so the
# time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [511]:
# The earliest rows of each country have no history, so their lagged values are
# NaN; treat missing history as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [512]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA inputs
# include the target 'Mortality Rate' and its lagged copies — the fitted
# components therefore encode the label (target leakage). The inputs are also
# unscaled, so the largest-variance columns dominate the components. Consider
# fitting PCA on predictor columns only, after standardization, on the training
# split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[512]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [513]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): these are the first 6 components of a PCA fit on unscaled
# columns that include 'Mortality Rate' and its lags, so the retained
# components can carry target information.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [514]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column holds a principal-component score (a linear mix of all inputs), not
# the named feature itself — the labels are misleading for interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [515]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df); in effect this cell only removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [516]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # 6 principal-component scores per row
y = df_updated['Mortality Rate'].values  # daily mortality rate target

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffle of daily time-series rows places near-identical
# adjacent days in both train and test, which inflates the reported scores; a
# chronological split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [517]:
# Fit scaling on the training set
# Standardization statistics come from the training split only, so the test set
# does not leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[517]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [518]:
# Apply scaling on the training set
# (zero mean / unit variance per component, using the fitted training statistics)
X_train_scaled = scaler.transform(X_train)
In [519]:
# Apply scaling on the test set
# (reuses the training-set statistics — do not refit on test data)
X_test_scaled = scaler.transform(X_test)
In [520]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid for GridSearchCV (3*3*3*3*2*2 = 324 combinations).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [521]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Exhaustive search over the 324-combination grid; n_jobs=-1 parallelizes over
# all cores. refit=True (the default) refits the best model on the whole
# training set after the search.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9993163648274013
In [522]:
# Use the tuned model selected by the grid search.
# FIX: the explicit best_model.fit(X_train_scaled, y_train) call was removed —
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full training set passed to grid_search.fit(), so refitting here was redundant.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [523]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and returns the
# Kullback-Leibler divergence KL(y_test || y_pred), not an "entropy" of the errors;
# it is ill-defined where y_pred is 0 and y_test is not — confirm this metric is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE, in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)  # KL divergence — see note above
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.015447420587881813
R2 Score: 0.9987332719640899
RMSE: 0.124288
Entropy Value: 0.0007928936109947494
In [524]:
# Gain-based importances from the fitted XGBoost model.
# FIX: X was built from principal_df, whose columns are PCA component scores,
# so each importance belongs to a principal component — not to the original
# feature whose name was reused as a column label. Label them PC1..PCk to
# avoid misattributing importance to raw features.
feature_importances = best_model.feature_importances_
pc_labels = ['PC%d' % (i + 1) for i in range(len(feature_importances))]
feature_importances = pd.DataFrame({'feature': pc_labels, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[524]:
feature importance
1 diabetes_prevalence 0.604268
0 cardiovasc_death_rate 0.336193
5 median_age 0.046313
2 male_smokers 0.012867
3 life_expectancy 0.000293
4 aged_65_older 0.000066
In [525]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# or a path relative to the notebook so the analysis is reproducible elsewhere.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[525]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [526]:
# Countries compared in this run of the country-health-index pipeline.
country1 = 'Iceland'
country2 = 'Italy'

# Restrict to the two comparison countries and keep only the identifier columns
# plus the country-health predictors used by the XGBoost model.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [527]:
df_updated
Out[527]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
20911 Iceland 2/28/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
20912 Iceland 2/29/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
20913 Iceland 3/1/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
20914 Iceland 3/2/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
20915 Iceland 3/3/2020 2.91 0.949 0.2 46482.958 3.404 372903 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2100 rows × 9 columns

In [528]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive per-country lagged mortality features (1 day, 7 days, 30 days) so the
# time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [529]:
# The earliest rows of each country have no history, so their lagged values are
# NaN; treat missing history as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [530]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA inputs
# include the target 'Mortality Rate' and its lagged copies — the fitted
# components therefore encode the label (target leakage). The inputs are also
# unscaled, so the huge 'population' column will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[530]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [531]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): these are the first 6 components of a PCA fit on unscaled
# columns that include 'Mortality Rate' and its lags, so the retained
# components can carry target information.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [532]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column holds a principal-component score (a linear mix of all inputs), not
# the named feature itself — the labels are misleading for interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [533]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df); in effect this cell only removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [534]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # 6 principal-component scores per row
y = df_updated['Mortality Rate'].values  # daily mortality rate target

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffle of daily time-series rows places near-identical
# adjacent days in both train and test, which inflates the reported scores; a
# chronological split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [535]:
# Fit scaling on the training set
# Standardization statistics come from the training split only, so the test set
# does not leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[535]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [536]:
# Apply scaling on the training set
# (zero mean / unit variance per component, using the fitted training statistics)
X_train_scaled = scaler.transform(X_train)
In [537]:
# Apply scaling on the test set
# (reuses the training-set statistics — do not refit on test data)
X_test_scaled = scaler.transform(X_test)
In [538]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid for GridSearchCV (3*3*3*3*2*2 = 324 combinations).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [539]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Exhaustive search over the 324-combination grid; n_jobs=-1 parallelizes over
# all cores. refit=True (the default) refits the best model on the whole
# training set after the search.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9994138916027948
In [540]:
# Use the tuned model selected by the grid search.
# FIX: the explicit best_model.fit(X_train_scaled, y_train) call was removed —
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full training set passed to grid_search.fit(), so refitting here was redundant.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [541]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and returns the
# Kullback-Leibler divergence KL(y_test || y_pred), not an "entropy" of the errors;
# it is ill-defined where y_pred is 0 and y_test is not — confirm this metric is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE, in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)  # KL divergence — see note above
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.03290481728582324
R2 Score: 0.9973017207413157
RMSE: 0.181397
Entropy Value: 0.0017395706724272842
In [542]:
# Gain-based importances from the fitted XGBoost model.
# FIX: X was built from principal_df, whose columns are PCA component scores,
# so each importance belongs to a principal component — not to the original
# feature whose name was reused as a column label. Label them PC1..PCk to
# avoid misattributing importance to raw features.
feature_importances = best_model.feature_importances_
pc_labels = ['PC%d' % (i + 1) for i in range(len(feature_importances))]
feature_importances = pd.DataFrame({'feature': pc_labels, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[542]:
feature importance
5 population 0.549859
1 human_development_index 0.349497
0 hospital_beds_per_thousand 0.042623
2 extreme_poverty 0.033844
3 gdp_per_capita 0.022440
4 population_density 0.001738
In [543]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# or a path relative to the notebook so the analysis is reproducible elsewhere.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[543]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [544]:
# Countries compared in this run of the population-health pipeline.
country1 = 'Portugal'
country2 = 'Slovenia'

# Restrict to the two comparison countries and keep only the identifier columns
# plus the population-health predictors used by the XGBoost model.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [545]:
df_updated
Out[545]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 25.0 81.32 19.062 44.5 0.536669

2096 rows × 9 columns

In [546]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive per-country lagged mortality features (1 day, 7 days, 30 days) so the
# time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [547]:
# The earliest rows of each country have no history, so their lagged values are
# NaN; treat missing history as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [548]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA inputs
# include the target 'Mortality Rate' and its lagged copies — the fitted
# components therefore encode the label (target leakage). The inputs are also
# unscaled, so the largest-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[548]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [549]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): these are the first 6 components of a PCA fit on unscaled
# columns that include 'Mortality Rate' and its lags, so the retained
# components can carry target information.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [550]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column holds a principal-component score (a linear mix of all inputs), not
# the named feature itself — the labels are misleading for interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [551]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df); in effect this cell only removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [552]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # 6 principal-component scores per row
y = df_updated['Mortality Rate'].values  # daily mortality rate target

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffle of daily time-series rows places near-identical
# adjacent days in both train and test, which inflates the reported scores; a
# chronological split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [553]:
# Fit scaling on the training set
# Standardization statistics come from the training split only, so the test set
# does not leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[553]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [554]:
# Apply scaling on the training set
# (zero mean / unit variance per component, using the fitted training statistics)
X_train_scaled = scaler.transform(X_train)
In [555]:
# Apply scaling on the test set
# (reuses the training-set statistics — do not refit on test data)
X_test_scaled = scaler.transform(X_test)
In [556]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid for GridSearchCV (3*3*3*3*2*2 = 324 combinations).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [557]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Exhaustive search over the 324-combination grid; n_jobs=-1 parallelizes over
# all cores. refit=True (the default) refits the best model on the whole
# training set after the search.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986102850111352
In [558]:
# Use the tuned model selected by the grid search.
# FIX: the explicit best_model.fit(X_train_scaled, y_train) call was removed —
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full training set passed to grid_search.fit(), so refitting here was redundant.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [559]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and returns the
# Kullback-Leibler divergence KL(y_test || y_pred), not an "entropy" of the errors;
# it is ill-defined where y_pred is 0 and y_test is not — confirm this metric is intended.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE, in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)  # KL divergence — see note above
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0023723489330236868
R2 Score: 0.9988124530308832
RMSE: 0.048707
Entropy Value: 0.0002848751285545325
In [560]:
# Gain-based importances from the fitted XGBoost model.
# FIX: X was built from principal_df, whose columns are PCA component scores,
# so each importance belongs to a principal component — not to the original
# feature whose name was reused as a column label. Label them PC1..PCk to
# avoid misattributing importance to raw features.
feature_importances = best_model.feature_importances_
pc_labels = ['PC%d' % (i + 1) for i in range(len(feature_importances))]
feature_importances = pd.DataFrame({'feature': pc_labels, 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[560]:
feature importance
1 diabetes_prevalence 0.861403
0 cardiovasc_death_rate 0.093971
3 life_expectancy 0.021326
2 male_smokers 0.017737
5 median_age 0.005372
4 aged_65_older 0.000190
In [561]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# or a path relative to the notebook so the analysis is reproducible elsewhere.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[561]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [562]:
# Countries compared in this run of the country-health-index pipeline.
country1 = 'Portugal'
country2 = 'Slovenia'

# Restrict to the two comparison countries and keep only the identifier columns
# plus the country-health predictors used by the XGBoost model.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [563]:
df_updated
Out[563]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2096 rows × 9 columns

In [564]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Derive per-country lagged mortality features (1 day, 7 days, 30 days) so the
# time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [565]:
# The earliest rows of each country have no history, so their lagged values are
# NaN; treat missing history as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [566]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA inputs
# include the target 'Mortality Rate' and its lagged copies — the fitted
# components therefore encode the label (target leakage). The inputs are also
# unscaled, so the huge 'population' column will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[566]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [567]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): these are the first 6 components of a PCA fit on unscaled
# columns that include 'Mortality Rate' and its lags, so the retained
# components can carry target information.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [568]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column holds a principal-component score (a linear mix of all inputs), not
# the named feature itself — the labels are misleading for interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [569]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df); in effect this cell only removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [570]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # 6 principal-component scores per row
y = df_updated['Mortality Rate'].values  # daily mortality rate target

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffle of daily time-series rows places near-identical
# adjacent days in both train and test, which inflates the reported scores; a
# chronological split would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [571]:
# Standardise features using statistics from the training split only,
# so no information from the test set leaks into the scaling.
scaler = StandardScaler().fit(X_train)
Out[571]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [572]:
# Apply the train-fitted scaler to the training features
X_train_scaled = scaler.transform(X_train)
In [573]:
# Reuse the train-fitted scaler on the test features (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [574]:
# XGBoost regressor with library defaults; everything we tune lives in the grid below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: 3*3*3*3*2*2 = 324 candidate settings.
params = {
    'max_depth': [3, 4, 5],               # tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per boosting round
    'n_estimators': [50, 100, 150],       # number of boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum split loss
    'subsample': [0.8, 0.9],              # row sampling per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling per tree
}
In [575]:
# Exhaustive grid search with 10-fold cross-validation on the scaled training
# data, parallelised across all cores. For a regressor the default scoring is R^2.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated R^2.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987931112255966
In [576]:
# GridSearchCV (refit=True, the default) has already refit best_estimator_ on the
# full training set, so the explicit best_model.fit(...) in the original was a
# redundant second training pass — dropped.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test rows
y_pred = best_model.predict(X_test_scaled)
In [577]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # average squared residual
rmse = np.sqrt(mse)  # same units as the target
score = r2_score(y_test, y_pred)  # fraction of variance explained
# NOTE(review): scipy.stats.entropy normalises its two inputs into probability
# distributions and returns their KL divergence — applied to raw regression
# targets/predictions it is not a meaningful error metric (and becomes inf if
# any y_pred is 0 where y_test is not). Consider dropping it or using MAE.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002719376915070921
R2 Score: 0.9986387382697272
RMSE: 0.052148
Entropy Value: 0.00030312420047472904
In [578]:
# Rank the six PCA components by the tuned model's importance scores.
# (Labels are PC aliases, not raw features — see earlier note.)
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[578]:
feature importance
1 human_development_index 0.727041
5 population 0.146595
0 hospital_beds_per_thousand 0.068287
2 extreme_poverty 0.033873
3 gdp_per_capita 0.023341
4 population_density 0.000863
In [579]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — move to a DATA_DIR / pathlib
# constant in a config cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[579]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [580]:
# Country pair for this comparison run.
country1 = 'Sweden'
country2 = 'United Kingdom'

# Restrict to the population-health predictors plus identifiers and the target,
# then keep only the two countries of interest.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [581]:
df_updated
Out[581]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.9 82.80 19.985 41.0 0.816005

2126 rows × 9 columns

In [582]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Take an explicit copy first: df_updated is a boolean-filtered slice of the
# loaded frame, and adding columns to a slice triggers pandas'
# SettingWithCopyWarning (and can silently fail to write). A copy makes the
# column assignments unambiguous.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [583]:
# Lag features are undefined for each location's first day/week/month;
# backfill those leading NaNs with 0 for a dense feature matrix.
# NOTE(review): 0 is an artificial placeholder, not an observed rate.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [584]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the iloc[:, 2:] slice includes 'Mortality Rate' (the target) and
# its three lag columns, so the components encode the target — leakage that
# likely inflates the downstream scores. PCA also runs on unscaled data here;
# scaling is applied only after, which is the wrong order for variance-based PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[584]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [585]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep the first 6 variance-ordered components; the slice must match pca.fit above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [586]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component
# (PC1..PC6, mixes of all inputs), not the raw feature it is named after, so the
# later "feature importances" rank components, not these named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [587]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used downstream (only
# 'Mortality Rate' is read from df_updated) — effectively a no-op for the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [588]:
# PCA-derived predictors (PC aliases — see principal_df note) and the raw target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 split, shuffle pinned by random_state for reproducibility.
# NOTE(review): random shuffling a daily time series lets neighbouring days
# straddle the split; prefer a chronological split for honest error estimates.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [589]:
# Standardise with training-split statistics only (no test-set leakage into scaling).
scaler = StandardScaler().fit(X_train)
Out[589]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [590]:
# Apply the train-fitted scaler to the training features
X_train_scaled = scaler.transform(X_train)
In [591]:
# Reuse the train-fitted scaler on the test features (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [592]:
# XGBoost regressor with defaults; the grid below defines what gets tuned.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: 324 candidate settings in total.
params = {
    'max_depth': [3, 4, 5],               # tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per round
    'n_estimators': [50, 100, 150],       # boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum split loss
    'subsample': [0.8, 0.9],              # row sampling per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling per tree
}
In [593]:
# Exhaustive search over the grid with 10-fold CV, all cores in parallel.
# Default scoring for a regressor is R^2.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Winning configuration and its mean cross-validated R^2.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9583436207367914
In [594]:
# GridSearchCV's default refit=True has already retrained best_estimator_ on the
# whole training set; the original's extra best_model.fit(...) duplicated that
# work and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test rows
y_pred = best_model.predict(X_test_scaled)
In [595]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # average squared residual
rmse = np.sqrt(mse)  # same units as the target
score = r2_score(y_test, y_pred)  # fraction of variance explained
# NOTE(review): scipy.stats.entropy treats its inputs as probability
# distributions and returns KL divergence — not a regression metric; consider
# dropping it (it can also return inf for zero predictions).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.3215249530704957
R2 Score: 0.9881991027784613
RMSE: 0.567032
Entropy Value: 0.002642072340961003
In [596]:
# Rank the six PCA components (PC aliases, not raw features) by model importance.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[596]:
feature importance
0 cardiovasc_death_rate 0.372382
5 median_age 0.354078
2 male_smokers 0.071201
1 diabetes_prevalence 0.068132
4 aged_65_older 0.067241
3 life_expectancy 0.066966
In [597]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — parameterise via a config-cell
# constant so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[597]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [598]:
# Country pair for this comparison run.
country1 = 'Sweden'
country2 = 'United Kingdom'

# Restrict to the country-health (infrastructure/economic) predictors plus
# identifiers and the target, then keep only the two countries of interest.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                       'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated[country_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [599]:
df_updated
Out[599]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.816005

2126 rows × 9 columns

In [600]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Copy first: df_updated is a boolean-filtered slice, and assigning new columns
# to a slice triggers SettingWithCopyWarning and can silently fail to write.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [601]:
# Backfill the leading NaNs of the lag features (undefined for each location's
# first day/week/month) with 0 for a dense matrix.
# NOTE(review): 0 is an artificial placeholder, not an observed rate.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [602]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so
# the components encode the target (leakage). PCA also runs pre-scaling here,
# letting large-magnitude columns (population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[602]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [603]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep the first 6 variance-ordered components; slice must match pca.fit above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [604]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component
# (PC1..PC6), not the raw feature it is named after; the later importances
# therefore rank components, not these named features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [605]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used downstream — effectively a no-op.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [606]:
# PCA-derived predictors (PC aliases) and the raw target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 split with a pinned shuffle for reproducibility.
# NOTE(review): consider a chronological split — random shuffling of a daily
# time series puts near-identical neighbouring days on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [607]:
# Standardise with training-split statistics only (no test-set leakage into scaling).
scaler = StandardScaler().fit(X_train)
Out[607]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [608]:
# Apply the train-fitted scaler to the training features
X_train_scaled = scaler.transform(X_train)
In [609]:
# Reuse the train-fitted scaler on the test features (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [610]:
# XGBoost regressor with defaults; tuning happens via the grid below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: 324 candidate settings in total.
params = {
    'max_depth': [3, 4, 5],               # tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per round
    'n_estimators': [50, 100, 150],       # boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum split loss
    'subsample': [0.8, 0.9],              # row sampling per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling per tree
}
In [611]:
# Grid search with 10-fold CV on scaled training data, parallel across cores.
# Default regressor scoring is R^2.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Winning configuration and its mean cross-validated R^2.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9546833804604962
In [612]:
# GridSearchCV's default refit=True already retrained best_estimator_ on the
# whole training set; the original's explicit best_model.fit(...) was a
# redundant second training pass and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test rows
y_pred = best_model.predict(X_test_scaled)
In [613]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # average squared residual
rmse = np.sqrt(mse)  # same units as the target
score = r2_score(y_test, y_pred)  # fraction of variance explained
# NOTE(review): entropy() returns KL divergence of normalised distributions —
# not a regression metric; consider dropping it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.997926932618917
R2 Score: 0.9633731906218204
RMSE: 0.998963
Entropy Value: 0.005106077515730719
In [614]:
# Rank the six PCA components (PC aliases, not raw features) by model importance.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[614]:
feature importance
1 human_development_index 0.576604
2 extreme_poverty 0.127571
5 population 0.122504
0 hospital_beds_per_thousand 0.113870
4 population_density 0.046708
3 gdp_per_capita 0.012743
In [615]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — parameterise via a config-cell constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[615]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [616]:
# Country pair for this comparison run.
country1 = 'United States'
country2 = 'Austria'

# Restrict to the population-health predictors plus identifiers and the target,
# then keep only the two countries of interest.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated[population_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [617]:
df_updated
Out[617]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 24.6 78.86 15.413 38.3 1.084791

2112 rows × 9 columns

In [618]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Copy first: df_updated is a boolean-filtered slice; assigning columns to a
# slice triggers SettingWithCopyWarning and can silently fail to write.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [619]:
# Backfill leading NaNs of the lag features with 0 for a dense matrix.
# NOTE(review): 0 is an artificial placeholder, not an observed rate.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [620]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns —
# the components encode the target (leakage), and PCA runs on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[620]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [621]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep the first 6 variance-ordered components; slice must match pca.fit above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [622]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component
# (PC1..PC6), not the raw feature it is named after; later importances rank
# components, not these named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [623]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummies are never used downstream — effectively a no-op.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [624]:
# PCA-derived predictors (PC aliases) and the raw target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 split with a pinned shuffle for reproducibility.
# NOTE(review): prefer a chronological split for a daily time series.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [625]:
# Standardise with training-split statistics only (no test-set leakage into scaling).
scaler = StandardScaler().fit(X_train)
Out[625]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [626]:
# Apply the train-fitted scaler to the training features
X_train_scaled = scaler.transform(X_train)
In [627]:
# Reuse the train-fitted scaler on the test features (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [628]:
# XGBoost regressor with defaults; tuning happens via the grid below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: 324 candidate settings in total.
params = {
    'max_depth': [3, 4, 5],               # tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per round
    'n_estimators': [50, 100, 150],       # boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum split loss
    'subsample': [0.8, 0.9],              # row sampling per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling per tree
}
In [629]:
# Grid search with 10-fold CV on scaled training data, parallel across cores.
# Default regressor scoring is R^2.
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Winning configuration and its mean cross-validated R^2.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9871989498238769
In [630]:
# GridSearchCV's default refit=True already retrained best_estimator_ on the
# whole training set; the original's explicit best_model.fit(...) was a
# redundant second training pass and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test rows
y_pred = best_model.predict(X_test_scaled)
In [631]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)  # average squared residual
rmse = np.sqrt(mse)  # same units as the target
score = r2_score(y_test, y_pred)  # fraction of variance explained
# NOTE(review): entropy() returns KL divergence of normalised distributions —
# not a regression metric; consider dropping it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0060186676398561656
R2 Score: 0.9961495778227457
RMSE: 0.077580
Entropy Value: 0.0005991889258687465
In [632]:
# Rank the six PCA components (PC aliases, not raw features) by model importance.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[632]:
feature importance
1 diabetes_prevalence 0.689863
0 cardiovasc_death_rate 0.126162
5 median_age 0.108889
2 male_smokers 0.031105
4 aged_65_older 0.028256
3 life_expectancy 0.015725
In [633]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — parameterise via a config-cell constant.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[633]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [634]:
# Country pair for this comparison run.
country1 = 'United States'
country2 = 'Austria'

# Restrict to the country-health (infrastructure/economic) predictors plus
# identifiers and the target, then keep only the two countries of interest.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                       'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                       'Mortality Rate']
df_updated = df_updated[country_health_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [635]:
df_updated
Out[635]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2112 rows × 9 columns

In [636]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the 1-day, 7-day and 30-day lagged mortality features in one pass.
# Shifting within each location group keeps one country's history from
# bleeding into the other's rows.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [637]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality
# columns with 0 — the earliest rows of each country have no history to lag from.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [638]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column from the third onward, which at this point
# includes the target 'Mortality Rate' and the three lagged-mortality columns — not just the
# six predictor features. The components are therefore partly derived from the target
# (leakage); presumably iloc[:, 2:8] was intended — confirm.
# NOTE(review): PCA is also fit on unscaled data here, while StandardScaler is applied only
# after PCA in the cells below; scaling before PCA is the conventional order — confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[638]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [639]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project the data onto the fitted components and keep only the first six.
# NOTE(review): the transform input must match the columns PCA was fit on (iloc[:, 2:],
# which here also contains the target and the lag columns — see the fit cell).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [640]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original features —
# labelling PC1..PC6 with raw feature names makes the later feature-importance table read
# as if it ranked the raw features. Names like 'PC1'..'PC6' would be more honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [641]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused in the cells below
# (X is built from principal_df, y from 'Mortality Rate'), so this step looks like dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [642]:
# Model inputs: the six principal components (mislabelled with raw feature names upstream).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Rows of principal_df align with df_updated by position, so y matches X row-for-row.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for daily time-series rows a
# chronological split avoids look-ahead leakage — confirm a random split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [643]:
# Fit scaling on the training set only, so test-set statistics never leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[643]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [644]:
# Apply scaling on the training set using the statistics fitted above.
X_train_scaled = scaler.transform(X_train)
In [645]:
# Apply scaling on the test set with the same (train-fitted) statistics.
X_test_scaled = scaler.transform(X_test)
In [646]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over (3*3*3*3*2*2 = 324 candidate settings).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [647]:
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all CPU cores.
# NOTE(review): shuffled folds over daily time-series rows place near-duplicate neighbouring
# days in both train and validation folds, which inflates the CV score — consider
# sklearn's TimeSeriesSplit for an honest estimate.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9887577648734872
In [648]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit best_estimator_
# on the full training set, so this second fit is redundant (harmless for a fixed seed).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [649]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence after normalising
# both vectors into distributions — it is not a standard regression metric and is
# undefined (inf) wherever y_pred is 0 while y_test is not; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00406836830426948
R2 Score: 0.9973972751975432
RMSE: 0.063784
Entropy Value: 0.0005141467774508717
In [650]:
# Rank the model inputs by XGBoost's learned importance scores, largest first.
# (These "features" are principal components — see the PCA cells above.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[650]:
feature importance
1 human_development_index 0.670685
5 population 0.231413
4 population_density 0.042005
2 extreme_poverty 0.032527
3 gdp_per_capita 0.019548
0 hospital_beds_per_thousand 0.003822
In [651]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[651]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [652]:
country1 = 'Belgium'
country2 = 'Estonia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() materialises the filtered slice so the later cells that assign new lag columns
# write to an independent frame instead of a view (avoids SettingWithCopyWarning).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [653]:
df_updated
Out[653]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.464100
7306 Estonia 12/26/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.464100
7307 Estonia 12/27/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.463645
7308 Estonia 12/28/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.466423
7309 Estonia 12/29/2022 255.569 4.02 39.3 78.74 19.452 42.7 0.466423

2121 rows × 9 columns

In [654]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the 1-day, 7-day and 30-day lagged mortality features in one pass,
# shifting within each location group so countries' histories stay separate.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [655]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality
# columns with 0 — the earliest rows of each country have no history to lag from.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [656]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and the three lag columns,
# not just the six predictor features — the components are partly derived from the target
# (leakage); presumably iloc[:, 2:8] was intended — confirm.
# NOTE(review): PCA is fit on unscaled data; StandardScaler is applied only after PCA below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[656]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [657]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto the fitted components and keep the first six.
# NOTE(review): the transform input must match the columns PCA was fit on (iloc[:, 2:],
# which here also contains the target and the lag columns — see the fit cell).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [658]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original features —
# labelling them with raw feature names makes the later importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [659]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns appear unused in the cells below
# (X is built from principal_df), so this step looks like dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [660]:
# Model inputs: the six principal components (mislabelled with raw feature names upstream).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Rows of principal_df align with df_updated by position, so y matches X row-for-row.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for daily time-series rows a
# chronological split avoids look-ahead leakage — confirm a random split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [661]:
# Fit scaling on the training set only, so test-set statistics never leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[661]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [662]:
# Apply scaling on the training set using the statistics fitted above.
X_train_scaled = scaler.transform(X_train)
In [663]:
# Apply scaling on the test set with the same (train-fitted) statistics.
X_test_scaled = scaler.transform(X_test)
In [664]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over (3*3*3*3*2*2 = 324 candidate settings).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [665]:
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all CPU cores.
# NOTE(review): shuffled folds over daily time-series rows inflate the CV score —
# consider sklearn's TimeSeriesSplit for an honest estimate.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9988940139433922
In [666]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit best_estimator_
# on the full training set, so this second fit is redundant (harmless for a fixed seed).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [667]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalising both
# vectors into distributions — not a standard regression metric, and undefined (inf)
# wherever y_pred is 0 while y_test is not; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.014213131068063751
R2 Score: 0.9988739751027007
RMSE: 0.119219
Entropy Value: 0.0008755638819072048
In [668]:
# Rank the model inputs by XGBoost's learned importance scores, largest first.
# (These "features" are principal components — see the PCA cells above.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[668]:
feature importance
1 diabetes_prevalence 0.867373
0 cardiovasc_death_rate 0.099856
2 male_smokers 0.023081
5 median_age 0.008580
3 life_expectancy 0.001066
4 aged_65_older 0.000044
In [669]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[669]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [670]:
country1 = 'Belgium'
country2 = 'Estonia'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() materialises the filtered slice so the later cells that assign new lag columns
# write to an independent frame instead of a view (avoids SettingWithCopyWarning).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [671]:
df_updated
Out[671]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423

2121 rows × 9 columns

In [672]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the 1-day, 7-day and 30-day lagged mortality features in one pass,
# shifting within each location group so countries' histories stay separate.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [673]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality
# columns with 0 — the earliest rows of each country have no history to lag from.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [674]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and the three lag columns,
# not just the six predictor features — the components are partly derived from the target
# (leakage); presumably iloc[:, 2:8] was intended — confirm.
# NOTE(review): PCA is fit on unscaled data; StandardScaler is applied only after PCA below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[674]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [675]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto the fitted components and keep the first six.
# NOTE(review): the transform input must match the columns PCA was fit on (iloc[:, 2:],
# which here also contains the target and the lag columns — see the fit cell).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [676]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original features —
# labelling them with raw feature names makes the later importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [677]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns appear unused in the cells below
# (X is built from principal_df), so this step looks like dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [678]:
# Model inputs: the six principal components (mislabelled with raw feature names upstream).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Rows of principal_df align with df_updated by position, so y matches X row-for-row.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for daily time-series rows a
# chronological split avoids look-ahead leakage — confirm a random split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [679]:
# Fit scaling on the training set only, so test-set statistics never leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[679]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [680]:
# Apply scaling on the training set using the statistics fitted above.
X_train_scaled = scaler.transform(X_train)
In [681]:
# Apply scaling on the test set with the same (train-fitted) statistics.
X_test_scaled = scaler.transform(X_test)
In [682]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over (3*3*3*3*2*2 = 324 candidate settings).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [683]:
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all CPU cores.
# NOTE(review): shuffled folds over daily time-series rows inflate the CV score —
# consider sklearn's TimeSeriesSplit for an honest estimate.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985104636986213
In [684]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit best_estimator_
# on the full training set, so this second fit is redundant (harmless for a fixed seed).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [685]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalising both
# vectors into distributions — not a standard regression metric, and undefined (inf)
# wherever y_pred is 0 while y_test is not; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012715196293674761
R2 Score: 0.9989926478879171
RMSE: 0.112762
Entropy Value: 0.0008232624023628059
In [686]:
# Rank the model inputs by XGBoost's learned importance scores, largest first.
# (These "features" are principal components — see the PCA cells above.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[686]:
feature importance
1 human_development_index 0.744249
0 hospital_beds_per_thousand 0.102687
2 extreme_poverty 0.100773
5 population 0.041484
3 gdp_per_capita 0.010488
4 population_density 0.000319
In [687]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[687]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [688]:
country1 = 'Ireland'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() materialises the filtered slice so the later cells that assign new lag columns
# write to an independent frame instead of a view (avoids SettingWithCopyWarning).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [689]:
df_updated
Out[689]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 3.28 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 51.0 75.29 19.754 43.9 0.631969

2073 rows × 9 columns

In [690]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the 1-day, 7-day and 30-day lagged mortality features in one pass,
# shifting within each location group so countries' histories stay separate.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [691]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality
# columns with 0 — the earliest rows of each country have no history to lag from.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [692]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and the three lag columns,
# not just the six predictor features — the components are partly derived from the target
# (leakage); presumably iloc[:, 2:8] was intended — confirm.
# NOTE(review): PCA is fit on unscaled data; StandardScaler is applied only after PCA below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[692]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [693]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto the fitted components and keep the first six.
# NOTE(review): the transform input must match the columns PCA was fit on (iloc[:, 2:],
# which here also contains the target and the lag columns — see the fit cell).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [694]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original features —
# labelling them with raw feature names makes the later importance table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [695]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns appear unused in the cells below
# (X is built from principal_df), so this step looks like dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [696]:
# Model inputs: the six principal components (mislabelled with raw feature names upstream).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Rows of principal_df align with df_updated by position, so y matches X row-for-row.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for daily time-series rows a
# chronological split avoids look-ahead leakage — confirm a random split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [697]:
# Fit scaling on the training set only, so test-set statistics never leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[697]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [698]:
# Apply scaling on the training set using the statistics fitted above.
X_train_scaled = scaler.transform(X_train)
In [699]:
# Apply scaling on the test set with the same (train-fitted) statistics.
X_test_scaled = scaler.transform(X_test)
In [700]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over (3*3*3*3*2*2 = 324 candidate settings).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [701]:
# Perform grid search and 10-fold cross-validation (k = 10); n_jobs=-1 uses all CPU cores.
# NOTE(review): shuffled folds over daily time-series rows inflate the CV score —
# consider sklearn's TimeSeriesSplit for an honest estimate.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985163505355373
In [702]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit best_estimator_
# on the full training set, so this second fit is redundant (harmless for a fixed seed).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [703]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after normalising both
# vectors into distributions — not a standard regression metric, and undefined (inf)
# wherever y_pred is 0 while y_test is not; confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0019485868775483287
R2 Score: 0.9991159573277734
RMSE: 0.044143
Entropy Value: 0.00044838336973350434
In [704]:
# Rank the model inputs by XGBoost's learned importance scores, largest first.
# (These "features" are principal components — see the PCA cells above.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[704]:
feature importance
0 cardiovasc_death_rate 0.492280
1 diabetes_prevalence 0.451240
5 median_age 0.028929
2 male_smokers 0.025303
3 life_expectancy 0.001392
4 aged_65_older 0.000856
In [705]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[705]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [706]:
country1 = 'Ireland'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() so the later cells that add lag columns write to an independent
# frame instead of a view of the full dataset (avoids SettingWithCopyWarning
# and potential silently-lost assignments).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [707]:
df_updated
Out[707]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631969

2073 rows × 9 columns

In [708]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged mortality features per country in one pass:
# 1 day, 7 days (week) and 30 days (month) back.
# NOTE(review): shift() assumes rows are date-ordered within each location — TODO confirm.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [709]:
# Replace the NaN values that shift() left at the start of each country's
# history. Filling with 0 matches the original handling.
# NOTE(review): 0 conflates "no prior data" with "zero mortality" — TODO confirm.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [710]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' plus the three lagged mortality columns.
# The components therefore encode the prediction target itself (target
# leakage), which likely explains the near-perfect R^2 reported below.
# NOTE(review): PCA is also fitted on ALL rows before the train/test split,
# and on unstandardized features, so large-scale columns (e.g. population)
# dominate the components — TODO confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[710]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [711]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# Keep only the first 6 of the 10 fitted components (6 predictors + target
# + 3 lag columns went into the fit).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [712]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of
# all PCA inputs), not the original variables — labelling them with raw
# feature names makes the later feature-importance table misleading;
# 'PC1'..'PC6' would be honest labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [713]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream (X is built from
# principal_df) — verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [714]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the six principal components (labelled with raw feature names above);
# y: the daily COVID-19 mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random (shuffled) split of a time series lets the model
# see interleaved future days; a chronological split would be a fairer test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [715]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only; the test split is
# transformed with these statistics in the cells below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[715]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [716]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [717]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [718]:
# Define XGBoost model.
# random_state pins the row/column subsampling draws (subsample and
# colsample_bytree < 1 below make training stochastic), so the grid
# search and the reported metrics are reproducible across re-runs.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 candidates).
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [719]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is the estimator's score() method (R^2);
# with the default refit=True, the best candidate is refitted on the whole
# training set and exposed as grid_search.best_estimator_.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9984706478843881
In [720]:
# GridSearchCV (refit=True by default) already refitted the best candidate
# on the full training set, so best_estimator_ is ready to use — the extra
# best_model.fit(...) call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [721]:
# Evaluate the XGBoost model on the held-out test set: MSE, RMSE and R^2.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# returns the KL divergence KL(pk || qk) — not an entropy — and is only
# meaningful for non-negative, distribution-like inputs (zeros in y_pred
# make it infinite). Reported under its real name to avoid confusion.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL divergence (y_test || y_pred):", entropy_val)
MSE:  0.0019969715660626214
R2 Score: 0.9990940059691648
RMSE: 0.044687
Entropy Value: 0.0003069312117218654
In [722]:
# Rank the model inputs by XGBoost's learned importance, largest first.
# NOTE(review): X was built from PCA components, so each "feature" label
# below names a principal component, not the raw variable it is named after.
importance_table = pd.DataFrame({'feature': selected_cols,
                                 'importance': best_model.feature_importances_})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[722]:
feature importance
1 human_development_index 0.457937
0 hospital_beds_per_thousand 0.370522
5 population 0.147929
2 extreme_poverty 0.021240
3 gdp_per_capita 0.001512
4 population_density 0.000862
In [723]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[723]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [724]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() so the later cells that add lag columns write to an independent
# frame instead of a view of the full dataset (avoids SettingWithCopyWarning
# and potential silently-lost assignments).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [725]:
df_updated
Out[725]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 26.0 82.25 14.312 39.7 0.377872

2078 rows × 9 columns

In [726]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged mortality features per country in one pass:
# 1 day, 7 days (week) and 30 days (month) back.
# NOTE(review): shift() assumes rows are date-ordered within each location — TODO confirm.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [727]:
# Replace the NaN values that shift() left at the start of each country's
# history. Filling with 0 matches the original handling.
# NOTE(review): 0 conflates "no prior data" with "zero mortality" — TODO confirm.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [728]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' plus the three lagged mortality columns.
# The components therefore encode the prediction target itself (target
# leakage), which likely explains the near-perfect R^2 reported below.
# NOTE(review): PCA is also fitted on ALL rows before the train/test split,
# and on unstandardized features, so large-scale columns dominate the
# components — TODO confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[728]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [729]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# Keep only the first 6 of the 10 fitted components (6 predictors + target
# + 3 lag columns went into the fit).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [730]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of
# all PCA inputs), not the original variables — labelling them with raw
# feature names makes the later feature-importance table misleading;
# 'PC1'..'PC6' would be honest labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [731]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream (X is built from
# principal_df) — verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [732]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X: the six principal components (labelled with raw feature names above);
# y: the daily COVID-19 mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random (shuffled) split of a time series lets the model
# see interleaved future days; a chronological split would be a fairer test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [733]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only; the test split is
# transformed with these statistics in the cells below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[733]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [734]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [735]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [736]:
# Define XGBoost model.
# random_state pins the row/column subsampling draws (subsample and
# colsample_bytree < 1 below make training stochastic), so the grid
# search and the reported metrics are reproducible across re-runs.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 candidates).
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [737]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is the estimator's score() method (R^2);
# with the default refit=True, the best candidate is refitted on the whole
# training set and exposed as grid_search.best_estimator_.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990232848825181
In [738]:
# GridSearchCV (refit=True by default) already refitted the best candidate
# on the full training set, so best_estimator_ is ready to use — the extra
# best_model.fit(...) call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [739]:
# Evaluate the XGBoost model on the held-out test set: MSE, RMSE and R^2.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# returns the KL divergence KL(pk || qk) — not an entropy — and is only
# meaningful for non-negative, distribution-like inputs (zeros in y_pred
# make it infinite). Reported under its real name to avoid confusion.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL divergence (y_test || y_pred):", entropy_val)
MSE:  0.005227635293595947
R2 Score: 0.9993186676908599
RMSE: 0.072302
Entropy Value: 0.0006067133512587061
In [740]:
# Rank the model inputs by XGBoost's learned importance, largest first.
# NOTE(review): X was built from PCA components, so each "feature" label
# below names a principal component, not the raw variable it is named after.
importance_table = pd.DataFrame({'feature': selected_cols,
                                 'importance': best_model.feature_importances_})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[740]:
feature importance
1 diabetes_prevalence 0.643346
0 cardiovasc_death_rate 0.233654
5 median_age 0.067902
2 male_smokers 0.048321
3 life_expectancy 0.006387
4 aged_65_older 0.000389
In [741]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[741]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [742]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() so the later cells that add lag columns write to an independent
# frame instead of a view of the full dataset (avoids SettingWithCopyWarning
# and potential silently-lost assignments).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [743]:
df_updated
Out[743]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872

2078 rows × 9 columns

In [744]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged mortality features per country in one pass:
# 1 day, 7 days (week) and 30 days (month) back.
# NOTE(review): shift() assumes rows are date-ordered within each location — TODO confirm.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [745]:
# Replace the NaN values that shift() left at the start of each country's
# history. Filling with 0 matches the original handling.
# NOTE(review): 0 conflates "no prior data" with "zero mortality" — TODO confirm.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [746]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' plus the three lagged mortality columns.
# The components therefore encode the prediction target itself (target
# leakage), which likely explains the near-perfect R^2 reported below.
# NOTE(review): PCA is also fitted on ALL rows before the train/test split,
# and on unstandardized features, so large-scale columns (e.g. population)
# dominate the components — TODO confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[746]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [747]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# Keep only the first 6 of the 10 fitted components (6 predictors + target
# + 3 lag columns went into the fit).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [748]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of
# all PCA inputs), not the original variables — labelling them with raw
# feature names makes the later feature-importance table misleading;
# 'PC1'..'PC6' would be honest labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [749]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream (X is built from
# principal_df) — verify this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [750]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the six principal components (labelled with raw feature names above);
# y: the daily COVID-19 mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random (shuffled) split of a time series lets the model
# see interleaved future days; a chronological split would be a fairer test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [751]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only; the test split is
# transformed with these statistics in the cells below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[751]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [752]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [753]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [754]:
# Define XGBoost model.
# random_state pins the row/column subsampling draws (subsample and
# colsample_bytree < 1 below make training stochastic), so the grid
# search and the reported metrics are reproducible across re-runs.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 candidates).
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [755]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is the estimator's score() method (R^2);
# with the default refit=True, the best candidate is refitted on the whole
# training set and exposed as grid_search.best_estimator_.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987243093178199
In [756]:
# GridSearchCV (refit=True by default) already refitted the best candidate
# on the full training set, so best_estimator_ is ready to use — the extra
# best_model.fit(...) call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [757]:
# Evaluate the XGBoost model on the held-out test set: MSE, RMSE and R^2.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors and
# returns the KL divergence KL(pk || qk) — not an entropy — and is only
# meaningful for non-negative, distribution-like inputs (zeros in y_pred
# make it infinite). Reported under its real name to avoid confusion.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("KL divergence (y_test || y_pred):", entropy_val)
MSE:  0.007716113263742701
R2 Score: 0.9989943374064345
RMSE: 0.087841
Entropy Value: 0.0017161933932385992
In [758]:
# Rank the model inputs by XGBoost's learned importance, largest first.
# NOTE(review): X was built from PCA components, so each "feature" label
# below names a principal component, not the raw variable it is named after.
importance_table = pd.DataFrame({'feature': selected_cols,
                                 'importance': best_model.feature_importances_})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[758]:
feature importance
1 human_development_index 0.482469
2 extreme_poverty 0.244199
0 hospital_beds_per_thousand 0.199908
5 population 0.057787
3 gdp_per_capita 0.014620
4 population_density 0.001017
In [759]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[759]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [760]:
country1 = 'Romania'
country2 = 'Slovakia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() so the later cells that add lag columns write to an independent
# frame instead of a view of the full dataset (avoids SettingWithCopyWarning
# and potential silently-lost assignments).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [761]:
df_updated
Out[761]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 37.7 77.54 15.07 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 37.1 76.05 17.85 43.0 2.036403

2067 rows × 9 columns

In [762]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lagged mortality features per country in one pass:
# 1 day, 7 days (week) and 30 days (month) back.
# NOTE(review): shift() assumes rows are date-ordered within each location — TODO confirm.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [763]:
# Replace the NaN values that shift() left at the start of each country's
# history. Filling with 0 matches the original handling.
# NOTE(review): 0 conflates "no prior data" with "zero mortality" — TODO confirm.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [764]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[764]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [765]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [766]:
# Wrap the retained principal components in a DataFrame.
# NOTE(review): these columns are principal components (linear combinations of
# ALL input columns), not the original features — labelling them with original
# feature names is misleading and carries through to the feature-importance
# table below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [767]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df), so this step only expands df_updated — apparently dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [768]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series leaks future observations into
# training; a chronological split would be methodologically sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [769]:
# Fit the scaler on the training set only, so no test-set statistics leak in
scaler = StandardScaler()
scaler.fit(X_train)
Out[769]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [770]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [771]:
# Apply the train-fitted scaling to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [772]:
# XGBoost regressor; hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 candidates)
params = {
    'max_depth':        [3, 4, 5],
    'learning_rate':    [0.1, 0.01, 0.001],
    'n_estimators':     [50, 100, 150],
    'gamma':            [0, 0.1, 0.2],
    'subsample':        [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [773]:
# Perform grid search with 10-fold cross-validation (324 candidates x 10 folds)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# (default scoring for a regressor is R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989475810367674
In [774]:
# GridSearchCV already refits the best estimator on the whole training set
# (refit=True by default), so best_estimator_ is ready to use — the explicit
# second fit was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [775]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and the KL divergence between the
# (normalized) actual and predicted mortality vectors.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE: scipy.stats.entropy(pk, qk) normalizes both vectors and computes the
# KL divergence — it is not a regression error metric, and it returns inf
# whenever a predicted value is 0 (or negative) where the actual value is
# positive. Clip predictions away from zero so the divergence stays finite.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0009858239346901049
R2 Score: 0.9994441257129435
RMSE: 0.031398
Entropy Value: 0.00012076580400027207
In [776]:
# Feature importances of the tuned model, sorted descending.
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC6 — the original-feature labels taken from
# selected_cols do not describe what the model actually used.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[776]:
feature importance
5 median_age 0.772914
1 diabetes_prevalence 0.149105
0 cardiovasc_death_rate 0.068522
2 male_smokers 0.008368
3 life_expectancy 0.000947
4 aged_65_older 0.000144
In [777]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable; prefer a
# configurable DATA_DIR (pathlib.Path) relative to the project.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[777]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [778]:
country1 = 'Romania'
country2 = 'Slovakia'

# Extracting important features for XGBoost Model Analysis for the country health index
# .copy() makes an independent frame so the later lag-column assignments do not
# write into a slice view (avoids SettingWithCopyWarning / silent no-ops).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [779]:
# Inspect the filtered two-country frame
df_updated
Out[779]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.820 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403

2067 rows × 9 columns

In [780]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country,
# turning the time series into a supervised-learning table.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [781]:
# The first rows of each country have no lag history; treat that missing
# history as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [782]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on unscaled data, so the components are dominated
# by the largest-variance columns (e.g. population) — fit StandardScaler first,
# then run PCA on the scaled features.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lagged
# mortality columns, so the target leaks into the components and inflates the
# downstream R^2 — restrict PCA to predictor columns only. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[782]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [783]:
# Keep only the first 6 principal components — chosen to match the number of
# input variables used in the XGBoost analysis, not by explained variance.
n_components = 6  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [784]:
# Wrap the retained principal components in a DataFrame.
# NOTE(review): these columns are principal components (linear combinations of
# ALL input columns), not the original features — labelling them with original
# feature names is misleading and carries through to the feature-importance
# table below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [785]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df), so this step only expands df_updated — apparently dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [786]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series leaks future observations into
# training; a chronological split would be methodologically sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [787]:
# Fit the scaler on the training set only, so no test-set statistics leak in
scaler = StandardScaler()
scaler.fit(X_train)
Out[787]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [788]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [789]:
# Apply the train-fitted scaling to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [790]:
# XGBoost regressor; hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 candidates)
params = {
    'max_depth':        [3, 4, 5],
    'learning_rate':    [0.1, 0.01, 0.001],
    'n_estimators':     [50, 100, 150],
    'gamma':            [0, 0.1, 0.2],
    'subsample':        [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [791]:
# Perform grid search with 10-fold cross-validation (324 candidates x 10 folds)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# (default scoring for a regressor is R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998030341444381
In [792]:
# GridSearchCV already refits the best estimator on the whole training set
# (refit=True by default), so best_estimator_ is ready to use — the explicit
# second fit was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [793]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and the KL divergence between the
# (normalized) actual and predicted mortality vectors.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE: scipy.stats.entropy(pk, qk) normalizes both vectors and computes the
# KL divergence — it is not a regression error metric, and it returns inf
# whenever a predicted value is 0 (or negative) where the actual value is
# positive. Clip predictions away from zero so the divergence stays finite.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0026098629278783294
R2 Score: 0.99852838256072
RMSE: 0.051087
Entropy Value: 0.0004080770571716045
In [794]:
# Feature importances of the tuned model, sorted descending.
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC6 — the original-feature labels taken from
# selected_cols do not describe what the model actually used.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[794]:
feature importance
5 population 0.746990
1 human_development_index 0.214658
0 hospital_beds_per_thousand 0.020482
2 extreme_poverty 0.015312
3 gdp_per_capita 0.002071
4 population_density 0.000487
In [795]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable; prefer a
# configurable DATA_DIR (pathlib.Path) relative to the project.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[795]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [796]:
country1 = 'Spain'
country2 = 'Switzerland'

# Extracting important features for XGBoost Model Analysis for the population health index
# .copy() makes an independent frame so the later lag-column assignments do not
# write into a slice view (avoids SettingWithCopyWarning / silent no-ops).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [797]:
# Inspect the filtered two-country frame
df_updated
Out[797]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 31.4 83.56 19.436 45.5 0.855148

2102 rows × 9 columns

In [798]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country,
# turning the time series into a supervised-learning table.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [799]:
# The first rows of each country have no lag history; treat that missing
# history as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [800]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on unscaled data, so the components are dominated
# by the largest-variance columns — fit StandardScaler first, then run PCA.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lagged
# mortality columns, so the target leaks into the components and inflates the
# downstream R^2 — restrict PCA to predictor columns only. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[800]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [801]:
# Keep only the first 6 principal components — chosen to match the number of
# input variables used in the XGBoost analysis, not by explained variance.
n_components = 6  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [802]:
# Wrap the retained principal components in a DataFrame.
# NOTE(review): these columns are principal components (linear combinations of
# ALL input columns), not the original features — labelling them with original
# feature names is misleading and carries through to the feature-importance
# table below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [803]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df), so this step only expands df_updated — apparently dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [804]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series leaks future observations into
# training; a chronological split would be methodologically sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [805]:
# Fit the scaler on the training set only, so no test-set statistics leak in
scaler = StandardScaler()
scaler.fit(X_train)
Out[805]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [806]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [807]:
# Apply the train-fitted scaling to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [808]:
# XGBoost regressor; hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 candidates)
params = {
    'max_depth':        [3, 4, 5],
    'learning_rate':    [0.1, 0.01, 0.001],
    'n_estimators':     [50, 100, 150],
    'gamma':            [0, 0.1, 0.2],
    'subsample':        [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [809]:
# Perform grid search with 10-fold cross-validation (324 candidates x 10 folds)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# (default scoring for a regressor is R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9991186723177916
In [810]:
# GridSearchCV already refits the best estimator on the whole training set
# (refit=True by default), so best_estimator_ is ready to use — the explicit
# second fit was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [811]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and the KL divergence between the
# (normalized) actual and predicted mortality vectors.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE: scipy.stats.entropy(pk, qk) normalizes both vectors and computes the
# KL divergence — it is not a regression error metric, and it returns inf
# whenever a predicted value is 0 (or negative) where the actual value is
# positive (this run printed "Entropy Value: inf" for exactly that reason).
# Clip predictions away from zero so the divergence stays finite.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005248655333758953
R2 Score: 0.9991504781499669
RMSE: 0.072448
Entropy Value: inf
In [812]:
# Feature importances of the tuned model, sorted descending.
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC6 — the original-feature labels taken from
# selected_cols do not describe what the model actually used.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[812]:
feature importance
0 cardiovasc_death_rate 0.534733
5 median_age 0.408620
2 male_smokers 0.039876
1 diabetes_prevalence 0.012956
3 life_expectancy 0.003662
4 aged_65_older 0.000153
In [813]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable; prefer a
# configurable DATA_DIR (pathlib.Path) relative to the project.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[813]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [814]:
country1 = 'Spain'
country2 = 'Switzerland'

# Extracting important features for XGBoost Model Analysis for the country health index
# .copy() makes an independent frame so the later lag-column assignments do not
# write into a slice view (avoids SettingWithCopyWarning / silent no-ops).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [815]:
# Inspect the filtered two-country frame
df_updated
Out[815]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.00 34272.360 93.105 47558632 0.855148

2102 rows × 9 columns

In [816]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country,
# turning the time series into a supervised-learning table.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [817]:
# The first rows of each country have no lag history; treat that missing
# history as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [818]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on unscaled data, so the components are dominated
# by the largest-variance columns (e.g. population) — fit StandardScaler first,
# then run PCA on the scaled features.
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' and the lagged
# mortality columns, so the target leaks into the components and inflates the
# downstream R^2 — restrict PCA to predictor columns only. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[818]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [819]:
# Keep only the first 6 principal components — chosen to match the number of
# input variables used in the XGBoost analysis, not by explained variance.
n_components = 6  # number of input variables for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [820]:
# Wrap the retained principal components in a DataFrame.
# NOTE(review): these columns are principal components (linear combinations of
# ALL input columns), not the original features — labelling them with original
# feature names is misleading and carries through to the feature-importance
# table below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [821]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df), so this step only expands df_updated — apparently dead work.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [822]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series leaks future observations into
# training; a chronological split would be methodologically sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [823]:
# Fit the scaler on the training set only, so no test-set statistics leak in
scaler = StandardScaler()
scaler.fit(X_train)
Out[823]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [824]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [825]:
# Apply the train-fitted scaling to the test set (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [826]:
# XGBoost regressor; hyperparameters are chosen by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 candidates)
params = {
    'max_depth':        [3, 4, 5],
    'learning_rate':    [0.1, 0.01, 0.001],
    'n_estimators':     [50, 100, 150],
    'gamma':            [0, 0.1, 0.2],
    'subsample':        [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [827]:
# Perform grid search with 10-fold cross-validation (324 candidates x 10 folds)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# (default scoring for a regressor is R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987177894783118
In [828]:
# GridSearchCV already refits the best estimator on the whole training set
# (refit=True by default), so best_estimator_ is ready to use — the explicit
# second fit was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [829]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and the KL divergence between the
# (normalized) actual and predicted mortality vectors.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE: scipy.stats.entropy(pk, qk) normalizes both vectors and computes the
# KL divergence — it is not a regression error metric, and it returns inf
# whenever a predicted value is 0 (or negative) where the actual value is
# positive. Clip predictions away from zero so the divergence stays finite.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008170528399905543
R2 Score: 0.9986775579723456
RMSE: 0.090391
Entropy Value: 0.0007620940166931194
In [830]:
# Feature importances of the tuned model, sorted descending.
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC6 — the original-feature labels taken from
# selected_cols do not describe what the model actually used.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[830]:
feature importance
1 human_development_index 0.522591
5 population 0.301978
0 hospital_beds_per_thousand 0.108450
2 extreme_poverty 0.033910
3 gdp_per_capita 0.032646
4 population_density 0.000425
In [831]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[831]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [832]:
country1 = 'Bulgaria'
country2 = 'Czechia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [833]:
df_updated
Out[833]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 38.3 79.38 19.027 43.3 0.919575

2061 rows × 9 columns

In [834]:
# To use XGBoost on this time series we recast it as a supervised-learning problem:
# each row keeps its static features plus lagged copies of the target ('Mortality Rate')
# from 1 day, 7 days, and 30 days earlier, computed per country with groupby().shift().
# NOTE(review): shift() assumes rows are date-sorted within each country — confirm the
# source CSV is ordered per location before relying on these lags.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [835]:
# The first 1/7/30 rows of each country's series have no earlier observation, so the
# lag columns start with NaN; define those missing lags as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [836]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself plus its three lag columns,
# so the prediction target leaks into the components used as model inputs below — this
# likely explains the near-perfect R^2. Fit PCA on the predictor columns only.
# NOTE(review): PCA is fitted on unscaled data (StandardScaler is applied only after PCA,
# further down); large-magnitude variables will dominate the components. The conventional
# order is standardize first, then PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[836]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [837]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): slicing [:, :n_components] keeps the 6 highest-variance components of all
# columns passed to PCA — which include the target and its lags; see the leakage note on
# the pca.fit cell above.
n_components = 6  # number of components retained for the XGBoost model inputs
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [838]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each column is a
# principal component (a linear mixture of all PCA inputs), not the named variable —
# the "feature importances" reported downstream therefore describe components, not the
# raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Attach country labels positionally (pca.transform preserves row order).
principal_df['location'] = df_updated['location'].values
In [839]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used as model inputs
# below (X is built from principal_df), so this step only changes df_updated's layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [840]:
# Model inputs are the principal components (labelled with the original feature names);
# the target is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so temporally adjacent (highly
# autocorrelated) observations land in both train and test — optimistic scores for a
# time series. A chronological split (shuffle=False) would be the stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [841]:
# Standardize features using statistics estimated from the training split only,
# so no test-set information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[841]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [842]:
# Apply scaling on the training set (note: applied after PCA here; the conventional
# order is scale first, then PCA)
X_train_scaled = scaler.transform(X_train)
In [843]:
# Apply scaling on the test set, reusing the training-set statistics (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [844]:
# XGBoost regressor with hyperparameters left at defaults; GridSearchCV below selects them.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate configurations.
params = {
    'max_depth': [3, 4, 5],               # maximum tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per boosting round
    'n_estimators': [50, 100, 150],       # number of boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum loss reduction to split a node
    'subsample': [0.8, 0.9],              # row sampling fraction per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling fraction per tree
}
In [845]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): with no `scoring` argument, GridSearchCV uses the regressor's default
# score, i.e. R^2. 324 candidates x 10 folds = 3240 fits; n_jobs=-1 parallelizes across
# all cores. The CV folds come from the shuffled split, so the time-series leakage
# caveat noted at the split cell applies here as well.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9561647329294217
In [846]:
# Use the model selected by the grid search. GridSearchCV (refit=True by default) has
# already refit best_estimator_ on the full training set, so the explicit
# best_model.fit(X_train_scaled, y_train) that used to be here was a redundant,
# identical retrain and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [847]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays to sum to 1 and
# returns the KL divergence between them — it is not a standard regression metric, and
# it is only well-defined for non-negative inputs (XGBoost regression predictions can
# be negative). Treat this number with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002649612864395656
R2 Score: 0.9983839393744748
RMSE: 0.051474
Entropy Value: 0.00027256482883888917
In [848]:
# Rank the model's inputs by XGBoost importance, highest first.
# (The 'feature' labels refer to principal components that were merely named after the
# original variables, not to the raw variables themselves.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[848]:
feature importance
5 median_age 0.464105
0 cardiovasc_death_rate 0.433036
2 male_smokers 0.045127
4 aged_65_older 0.026142
1 diabetes_prevalence 0.021133
3 life_expectancy 0.010457
In [849]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine. Prefer a
# configurable DATA_DIR (pathlib.Path) defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[849]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [850]:
country1 = 'Bulgaria'
country2 = 'Czechia'

# Keep only the country-health-index predictors plus identifiers and the target,
# restricted to the two countries under comparison.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [851]:
df_updated
Out[851]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919575

2061 rows × 9 columns

In [852]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are date-sorted within each country — confirm the
# source CSV is ordered per location before relying on these lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [853]:
# Lag columns are NaN at the start of each country's series (no earlier observation);
# define those missing lags as 0. fillna with a dict touches only the listed columns.
df_updated = df_updated.fillna({'prev_day_mortality': 0,
                                'prev_week_mortality': 0,
                                'prev_month_mortality': 0})
In [854]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, leaking the
# target into the PCA inputs; PCA is also fitted on unscaled data (scaling happens
# later), so large-magnitude variables will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[854]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [855]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of retained components (see the leakage note on the pca.fit cell)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [856]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns are principal components, merely labelled with the original
# variable names — importances computed on them do not map back to the raw variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Attach country labels positionally (pca.transform preserves row order).
principal_df['location'] = df_updated['location'].values
In [857]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used downstream (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [858]:
# Model inputs are the principal components (labelled with the original feature names);
# the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split of an autocorrelated time series — a chronological split
# (shuffle=False) would give a stricter, more realistic evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [859]:
# Standardize features using statistics estimated from the training split only,
# so no test-set information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[859]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [860]:
# Apply scaling on the training set (note: applied after PCA here; the conventional
# order is scale first, then PCA)
X_train_scaled = scaler.transform(X_train)
In [861]:
# Apply scaling on the test set, reusing the training-set statistics (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [862]:
# XGBoost regressor with hyperparameters left at defaults; GridSearchCV below selects them.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate configurations.
params = {
    'max_depth': [3, 4, 5],               # maximum tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per boosting round
    'n_estimators': [50, 100, 150],       # number of boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum loss reduction to split a node
    'subsample': [0.8, 0.9],              # row sampling fraction per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling fraction per tree
}
In [863]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): with no `scoring` argument, GridSearchCV uses the regressor's default
# score (R^2). 324 candidates x 10 folds = 3240 fits, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9555145568773449
In [864]:
# Use the model selected by the grid search. GridSearchCV (refit=True by default) has
# already refit best_estimator_ on the full training set, so the explicit
# best_model.fit(X_train_scaled, y_train) that used to be here was a redundant,
# identical retrain and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [865]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and returns KL
# divergence — not a standard regression metric; only defined for non-negative inputs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002769256603078144
R2 Score: 0.9983109658703929
RMSE: 0.052624
Entropy Value: 0.00028051806238002606
In [866]:
# Rank the model's inputs by XGBoost importance, highest first.
# (The 'feature' labels refer to principal components named after the original
# variables, not to the raw variables themselves.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[866]:
feature importance
0 hospital_beds_per_thousand 0.823913
5 population 0.128464
2 extreme_poverty 0.019683
1 human_development_index 0.016833
4 population_density 0.005945
3 gdp_per_capita 0.005162
In [867]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR
# (pathlib.Path) defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[867]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [868]:
country1 = 'France'
country2 = 'Serbia'

# Keep only the population-health-index predictors plus identifiers and the target,
# restricted to the two countries under comparison.
pop_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
            'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
            'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_cols]
In [869]:
df_updated
Out[869]:
location date cardiovasc_death_rate diabetes_prevalence male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
8377 France 1/25/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
8378 France 1/26/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
8379 France 1/27/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
8380 France 1/28/2020 86.060 4.77 35.6 82.66 19.718 42.0 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 40.2 76.00 17.366 41.2 0.716205

2109 rows × 9 columns

In [870]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are date-sorted within each country — confirm the
# source CSV is ordered per location before relying on these lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [871]:
# The first 1/7/30 rows of each country's series have no earlier observation, so the
# lag columns start with NaN; define those missing lags as 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [872]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, leaking the
# target into the PCA inputs; PCA is also fitted on unscaled data (scaling happens
# later), so large-magnitude variables will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[872]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [873]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of retained components (see the leakage note on the pca.fit cell)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [874]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns are principal components, merely labelled with the original
# variable names — importances computed on them do not map back to the raw variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Attach country labels positionally (pca.transform preserves row order).
principal_df['location'] = df_updated['location'].values
In [875]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used downstream (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [876]:
# Model inputs are the principal components (labelled with the original feature names);
# the target is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split of an autocorrelated time series — a chronological split
# (shuffle=False) would give a stricter, more realistic evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [877]:
# Standardize features using statistics estimated from the training split only,
# so no test-set information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[877]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [878]:
# Apply scaling on the training set (note: applied after PCA here; the conventional
# order is scale first, then PCA)
X_train_scaled = scaler.transform(X_train)
In [879]:
# Apply scaling on the test set, reusing the training-set statistics (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [880]:
# XGBoost regressor with hyperparameters left at defaults; GridSearchCV below selects them.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate configurations.
params = {
    'max_depth': [3, 4, 5],               # maximum tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per boosting round
    'n_estimators': [50, 100, 150],       # number of boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum loss reduction to split a node
    'subsample': [0.8, 0.9],              # row sampling fraction per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling fraction per tree
}
In [881]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): with no `scoring` argument, GridSearchCV uses the regressor's default
# score (R^2). 324 candidates x 10 folds = 3240 fits, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9941440903988384
In [882]:
# Use the model selected by the grid search. GridSearchCV (refit=True by default) has
# already refit best_estimator_ on the full training set, so the explicit
# best_model.fit(X_train_scaled, y_train) that used to be here was a redundant,
# identical retrain and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [883]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays and returns KL
# divergence — not a standard regression metric; only defined for non-negative inputs.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.05493983728553627
R2 Score: 0.9952696565378223
RMSE: 0.234392
Entropy Value: 0.0017421697067628312
In [884]:
# Rank the model's inputs by XGBoost importance, highest first.
# (The 'feature' labels refer to principal components named after the original
# variables, not to the raw variables themselves.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[884]:
feature importance
1 diabetes_prevalence 0.758670
0 cardiovasc_death_rate 0.166348
4 aged_65_older 0.025166
5 median_age 0.025088
2 male_smokers 0.013602
3 life_expectancy 0.011126
In [885]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR
# (pathlib.Path) defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[885]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [886]:
country1 = 'France'
country2 = 'Serbia'

# Keep only the country-health-index predictors plus identifiers and the target,
# restricted to the two countries under comparison.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [887]:
df_updated
Out[887]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
8377 France 1/25/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
8378 France 1/26/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
8379 France 1/27/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
8380 France 1/28/2020 5.980 0.901 0.02 38605.671 122.578 67813000 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716205

2109 rows × 9 columns

In [888]:
# To use XGBoost on this time series we recast it as a supervised-learning problem:
# each row keeps its static features plus lagged copies of the target ('Mortality Rate')
# from 1 day, 7 days, and 30 days earlier, computed per country with groupby().shift().
# NOTE(review): shift() assumes rows are date-sorted within each country — confirm the
# source CSV is ordered per location before relying on these lags.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [889]:
# Lag columns are NaN at the start of each country's series (no earlier observation);
# define those missing lags as 0. fillna with a dict touches only the listed columns.
df_updated = df_updated.fillna({'prev_day_mortality': 0,
                                'prev_week_mortality': 0,
                                'prev_month_mortality': 0})
In [890]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, leaking the
# target into the PCA inputs; PCA is also fitted on unscaled data (scaling happens
# later), so large-magnitude variables will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[890]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [891]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of retained components (see the leakage note on the pca.fit cell)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [892]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the columns are principal components, merely labelled with the original
# variable names — importances computed on them do not map back to the raw variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Attach country labels positionally (pca.transform preserves row order).
principal_df['location'] = df_updated['location'].values
In [893]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used downstream (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [894]:
# Model inputs are the principal components (labelled with the original feature names);
# the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split of an autocorrelated time series — a chronological split
# (shuffle=False) would give a stricter, more realistic evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [895]:
# Standardize features using statistics estimated from the training split only,
# so no test-set information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[895]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [896]:
# Apply scaling on the training set (note: applied after PCA here; the conventional
# order is scale first, then PCA)
X_train_scaled = scaler.transform(X_train)
In [897]:
# Apply scaling on the test set, reusing the training-set statistics (no refit — avoids leakage)
X_test_scaled = scaler.transform(X_test)
In [898]:
# XGBoost regressor with hyperparameters left at defaults; GridSearchCV below selects them.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate configurations.
params = {
    'max_depth': [3, 4, 5],               # maximum tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per boosting round
    'n_estimators': [50, 100, 150],       # number of boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum loss reduction to split a node
    'subsample': [0.8, 0.9],              # row sampling fraction per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling fraction per tree
}
In [899]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): with no `scoring` argument, GridSearchCV uses the regressor's default
# score (R^2). 324 candidates x 10 folds = 3240 fits, parallelized across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9930827422814454
In [900]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [901]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence; applying it to raw mortality
# values is questionable as a regression metric, and zeros in y_pred can
# produce inf — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.07948072065736031
R2 Score: 0.9931566759949306
RMSE: 0.281923
Entropy Value: 0.0020226363731710244
In [902]:
# Rank the model inputs by their XGBoost importance scores (highest first).
# NOTE: these inputs are PCA components labeled with raw feature names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[902]:
feature importance
1 human_development_index 0.622370
2 extreme_poverty 0.146551
0 hospital_beds_per_thousand 0.105804
5 population 0.055100
4 population_density 0.048866
3 gdp_per_capita 0.021310
In [25]:
# Country Pair by Pair Analysis relative to life expectancy
In [26]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[26]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [27]:
# Showing the pairings of countries based on life expectancy (13 pairs of countries)

def _country_rows(frame, name):
    """Return the rows of `frame` belonging to one country (via the `location` column)."""
    return frame[frame.location == name]

# One per-country dataframe for each paired country (blank lines group the pairs).
df_Austria = _country_rows(df, "Austria")
df_Belgium = _country_rows(df, "Belgium")

df_Canada = _country_rows(df, "Canada")
df_Cyprus = _country_rows(df, "Cyprus")

df_Denmark = _country_rows(df, "Denmark")
df_Finland = _country_rows(df, "Finland")

df_France = _country_rows(df, "France")
df_Iceland = _country_rows(df, "Iceland")

df_Ireland = _country_rows(df, "Ireland")
df_Italy = _country_rows(df, "Italy")

df_Luxembourg = _country_rows(df, "Luxembourg")
df_Netherlands = _country_rows(df, "Netherlands")

df_Portugal = _country_rows(df, "Portugal")
df_Slovenia = _country_rows(df, "Slovenia")

df_Spain = _country_rows(df, "Spain")
df_Sweden = _country_rows(df, "Sweden")

df_Switzerland = _country_rows(df, "Switzerland")
df_UnitedKingdom = _country_rows(df, "United Kingdom")

df_Czechia = _country_rows(df, "Czechia")
df_Estonia = _country_rows(df, "Estonia")

df_UnitedStates = _country_rows(df, "United States")
df_Bulgaria = _country_rows(df, "Bulgaria")

df_Latvia = _country_rows(df, "Latvia")
df_Romania = _country_rows(df, "Romania")

df_Serbia = _country_rows(df, "Serbia")
df_Slovakia = _country_rows(df, "Slovakia")
In [28]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [29]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): this writes the pandas index as an extra unnamed column; pass
# index=False if the re-imported file should match this frame exactly.
dataframe_one.to_csv("dataframe-one.csv")
In [30]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[30]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [31]:
country1 = 'Austria'
country2 = 'Belgium'

# Extracting important features for XGBoost Model Analysis for the population health index
# Restrict to the population-health feature set and to the current country pair
# in a single .loc (row mask + column list).
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, feature_cols]
In [32]:
df_updated
Out[32]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787
2095 Belgium 12/26/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787
2096 Belgium 12/27/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787
2097 Belgium 12/28/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787
2098 Belgium 12/29/2022 114.898 4.29 25.1 31.4 18.571 41.8 0.711787

2099 rows × 9 columns

In [33]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build day/week/month lagged mortality features, shifting within each country
# so one country's history never leaks into another's rows.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [34]:
# The leading rows of each country have no history, so the lagged columns
# start as NaN; treat missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [35]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the whole (train+test) dataset and on unscaled
# features, so large-magnitude columns dominate the components and test rows
# leak into the fit — consider scaling first and fitting on the training split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[35]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [36]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# PCA() above computed all components; keep only the first 6 columns here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [37]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labeling them with raw feature names is misleading when the
# feature-importance table is read downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [38]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never listed in
# selected_cols below, so they do not feed the model — confirm this is intended.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [39]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
# NOTE(review): X comes from principal_df, whose columns are PCA components
# re-labeled with raw feature names — downstream importances describe
# components, not the original variables. PCA was also fit on all rows before
# this split, so test information leaks into the transform.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [40]:
# Fit scaling on the training set
# Standardize to zero mean / unit variance; fitting on the training split only
# keeps test-set statistics out of the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[40]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [41]:
# Apply scaling on the training set
# (uses the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [42]:
# Apply scaling on the test set
# (same training-set statistics applied to the held-out rows)
X_test_scaled = scaler.transform(X_test)
In [43]:
# Define XGBoost model
# Regressor with its default squared-error objective; hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations for the grid search.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [44]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelizes over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986706695832035
In [45]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV's default refit=True already refits
# best_estimator_ on the full training data, so this fit is redundant (harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [46]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence; applying it to raw mortality
# values is questionable as a regression metric, and zeros in y_pred can
# produce inf — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01609753378306672
R2 Score: 0.998640384719372
RMSE: 0.126876
Entropy Value: 0.0006364314394538599
In [47]:
# Rank the model inputs by their XGBoost importance scores (highest first).
# NOTE: these inputs are PCA components labeled with raw feature names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[47]:
feature importance
0 cardiovasc_death_rate 0.473470
1 diabetes_prevalence 0.471525
2 female_smokers 0.024737
5 median_age 0.021757
3 male_smokers 0.008255
4 aged_65_older 0.000256
In [48]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[48]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [49]:
country1 = 'Austria'
country2 = 'Belgium'

# Extracting important features for XGBoost Model Analysis for the country health index
# Restrict to the country-health feature set and to the current country pair
# in a single .loc (row mask + column list).
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
pair_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[pair_mask, feature_cols]
In [50]:
df_updated
Out[50]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.931 0.2 42658.576 375.564 11655923 0.711787

2099 rows × 9 columns

In [51]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build day/week/month lagged mortality features, shifting within each country
# so one country's history never leaks into another's rows.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [52]:
# The leading rows of each country have no history, so the lagged columns
# start as NaN; treat missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [53]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the whole (train+test) dataset and on unscaled
# features, so large-magnitude columns dominate the components and test rows
# leak into the fit — consider scaling first and fitting on the training split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[53]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [54]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# PCA() above computed all components; keep only the first 6 columns here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [55]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labeling them with raw feature names is misleading when the
# feature-importance table is read downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [56]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never listed in
# selected_cols below, so they do not feed the model — confirm this is intended.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [57]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): X comes from principal_df, whose columns are PCA components
# re-labeled with raw feature names — downstream importances describe
# components, not the original variables. PCA was also fit on all rows before
# this split, so test information leaks into the transform.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [58]:
# Fit scaling on the training set
# Standardize to zero mean / unit variance; fitting on the training split only
# keeps test-set statistics out of the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[58]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [59]:
# Apply scaling on the training set
# (uses the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [60]:
# Apply scaling on the test set
# (same training-set statistics applied to the held-out rows)
X_test_scaled = scaler.transform(X_test)
In [61]:
# Define XGBoost model
# Regressor with its default squared-error objective; hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations for the grid search.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [62]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelizes over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985453142557521
In [63]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV's default refit=True already refits
# best_estimator_ on the full training data, so this fit is redundant (harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [64]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence; applying it to raw mortality
# values is questionable as a regression metric, and zeros in y_pred can
# produce inf — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01731215895223598
R2 Score: 0.9985377961513036
RMSE: 0.131576
Entropy Value: 0.000755209259742723
In [65]:
# Rank the model inputs by their XGBoost importance scores (highest first).
# NOTE: these inputs are PCA components labeled with raw feature names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[65]:
feature importance
1 human_development_index 0.604442
5 population 0.227045
0 hospital_beds_per_thousand 0.118292
2 extreme_poverty 0.041327
3 gdp_per_capita 0.008604
4 population_density 0.000290
In [66]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[66]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [67]:
country1 = 'Canada'
country2 = 'Cyprus'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [68]:
df_updated
Out[68]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 16.984 41.4 1.093162

2099 rows × 9 columns

In [69]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [70]:
# The leading rows of each country have no history, so the lagged columns
# start as NaN; treat missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [71]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the whole (train+test) dataset and on unscaled
# features, so large-magnitude columns dominate the components and test rows
# leak into the fit — consider scaling first and fitting on the training split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[71]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [72]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# PCA() above computed all components; keep only the first 6 columns here.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [73]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — labeling them with raw feature names is misleading when the
# feature-importance table is read downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [74]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never listed in
# selected_cols below, so they do not feed the model — confirm this is intended.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [75]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
# NOTE(review): X comes from principal_df, whose columns are PCA components
# re-labeled with raw feature names — downstream importances describe
# components, not the original variables. PCA was also fit on all rows before
# this split, so test information leaks into the transform.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [76]:
# Fit scaling on the training set
# Standardize to zero mean / unit variance; fitting on the training split only
# keeps test-set statistics out of the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[76]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [77]:
# Apply scaling on the training set
# (uses the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [78]:
# Apply scaling on the test set
# (same training-set statistics applied to the held-out rows)
X_test_scaled = scaler.transform(X_test)
In [79]:
# Define XGBoost model
# Regressor with its default squared-error objective; hyperparameters are tuned below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations for the grid search.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [80]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelizes over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.999053878094869
In [81]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV's default refit=True already refits
# best_estimator_ on the full training data, so this fit is redundant (harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [82]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence; applying it to raw mortality
# values is questionable as a regression metric, and zeros in y_pred can
# produce inf — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002132122245038013
R2 Score: 0.9993731672242354
RMSE: 0.046175
Entropy Value: 0.00042462368096495684
In [83]:
# Rank the model inputs by their XGBoost importance scores (highest first).
# NOTE: these inputs are PCA components labeled with raw feature names.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[83]:
feature importance
0 cardiovasc_death_rate 0.472233
1 diabetes_prevalence 0.433225
5 median_age 0.076136
2 female_smokers 0.016640
3 male_smokers 0.001581
4 aged_65_older 0.000185
In [84]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[84]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [85]:
country1 = 'Canada'
country2 = 'Cyprus'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame so the lagged
# mortality columns assigned in later cells cannot trigger pandas'
# SettingWithCopyWarning on a slice view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [86]:
df_updated
Out[86]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.4 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.50 44017.591 4.037 38454328 1.093162

2099 rows × 9 columns

In [87]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1 day, 7 days, 30 days back) per country;
# a single grouped series is reused for all three shifts.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [88]:
# The first rows of each country have no history, so the shifted columns start
# as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [89]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'date', which includes
# 'Mortality Rate' (the prediction target) and its three lagged copies -- the
# target leaks into the components later used as model inputs, which likely
# explains the near-perfect R^2 downstream. PCA is also fitted on the full
# dataset before the train/test split, leaking test-fold statistics.
# TODO: fit PCA on the predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[89]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [90]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): keeps the first 6 components of a matrix that contains
# 'Mortality Rate' and its lagged columns, so target information is carried
# into the model inputs -- confirm this is intended.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [91]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the named
# feature -- downstream feature-importance tables inherit this misleading labelling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [92]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are never used afterwards
# (X is built from principal_df, y from 'Mortality Rate'); this step can
# probably be dropped -- confirm.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [93]:
# NOTE(review): selected_cols here name principal components, not raw features
# (see the construction of principal_df) -- kept for labelling consistency.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of daily time-series rows places nearly
# identical adjacent days in both folds; a chronological split would give a
# more honest estimate of generalisation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [94]:
# Fit scaling on the training set
# (mean/std are learned from the training fold only and applied to both folds
# in the next two cells -- correct practice)
scaler = StandardScaler()
scaler.fit(X_train)
Out[94]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [95]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [96]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [97]:
# XGBoost regressor to be tuned by the grid search in the next cell
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: 3 * 3 * 3 * 3 * 2 * 2 = 324 candidate configurations
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [98]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE: 324 parameter combinations x 10 folds = 3240 model fits; this is the
# expensive cell (n_jobs=-1 parallelises across all cores). Consider timing it
# with %%time so readers know the cost.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989605271130995
In [99]:
# GridSearchCV (refit=True by default) has already refit the best configuration
# on the whole training set, so best_estimator_ is ready to use; the explicit
# second fit() on the same data was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [100]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# its two arguments after normalising each to a probability distribution; raw
# mortality rates are not distributions, and any zero in y_pred yields inf.
# Confirm this "entropy" value is a meaningful regression metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002685282233557505
R2 Score: 0.999210541084082
RMSE: 0.051820
Entropy Value: 0.00034527877599041973
In [101]:
# NOTE(review): selected_cols label principal components (see principal_df),
# not the original variables -- this table ranks PCs, and the feature names
# shown should not be read as importances of the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[101]:
feature importance
5 population 0.655426
1 human_development_index 0.198181
0 hospital_beds_per_thousand 0.115957
2 extreme_poverty 0.020517
4 population_density 0.006514
3 gdp_per_capita 0.003405
In [102]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[102]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [103]:
country1 = 'Denmark'
country2 = 'Finland'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame so the lagged
# mortality columns assigned in later cells cannot trigger pandas'
# SettingWithCopyWarning on a slice view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [104]:
df_updated
Out[104]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 19.677 42.3 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159
8372 Finland 12/26/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159
8373 Finland 12/27/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159
8374 Finland 12/28/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159
8375 Finland 12/29/2022 153.507 5.76 18.3 22.6 21.228 42.8 0.55159

2128 rows × 9 columns

In [105]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1 day, 7 days, 30 days back) per country;
# a single grouped series is reused for all three shifts.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [106]:
# The first rows of each country have no history, so the shifted columns start
# as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [107]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'date', which includes
# 'Mortality Rate' (the prediction target) and its three lagged copies -- the
# target leaks into the components later used as model inputs. PCA is also
# fitted on the full dataset before the train/test split, leaking test-fold
# statistics. TODO: fit PCA on predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[107]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [108]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [109]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [110]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [111]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [112]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[112]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [113]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [114]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [115]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [116]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998779521821126
In [117]:
# GridSearchCV (refit=True by default) has already refit the best configuration
# on the whole training set, so best_estimator_ is ready to use; the explicit
# second fit() on the same data was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [118]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# its two arguments after normalising each to a probability distribution; raw
# mortality rates are not distributions, and any zero in y_pred yields inf.
# Confirm this "entropy" value is a meaningful regression metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008955815256083285
R2 Score: 0.9949195425294739
RMSE: 0.094635
Entropy Value: 0.0018700174545439696
In [119]:
# NOTE(review): selected_cols label principal components (see principal_df),
# not the original variables -- this table ranks PCs, and the feature names
# shown should not be read as importances of the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[119]:
feature importance
1 diabetes_prevalence 0.825645
0 cardiovasc_death_rate 0.069251
2 female_smokers 0.051373
5 median_age 0.031775
3 male_smokers 0.020486
4 aged_65_older 0.001471
In [120]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[120]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [121]:
country1 = 'Denmark'
country2 = 'Finland'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame so the lagged
# mortality columns assigned in later cells cannot trigger pandas'
# SettingWithCopyWarning on a slice view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [122]:
df_updated
Out[122]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5188 Denmark 2/3/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5189 Denmark 2/4/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5190 Denmark 2/5/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5191 Denmark 2/6/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2128 rows × 9 columns

In [123]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1 day, 7 days, 30 days back) per country;
# a single grouped series is reused for all three shifts.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [124]:
# The first rows of each country have no history, so the shifted columns start
# as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [125]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'date', which includes
# 'Mortality Rate' (the prediction target) and its three lagged copies -- the
# target leaks into the components later used as model inputs. PCA is also
# fitted on the full dataset before the train/test split, leaking test-fold
# statistics. TODO: fit PCA on predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[125]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [126]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [127]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [128]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [129]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [130]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[130]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [131]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [132]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [133]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [134]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984953654596765
In [135]:
# GridSearchCV (refit=True by default) has already refit the best configuration
# on the whole training set, so best_estimator_ is ready to use; the explicit
# second fit() on the same data was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [136]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# its two arguments after normalising each to a probability distribution; raw
# mortality rates are not distributions, and any zero in y_pred yields inf.
# Confirm this "entropy" value is a meaningful regression metric here.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008136863131264145
R2 Score: 0.9953841179278682
RMSE: 0.090205
Entropy Value: 0.0015398143907589235
In [137]:
# NOTE(review): selected_cols label principal components (see principal_df),
# not the original variables -- this table ranks PCs, and the feature names
# shown should not be read as importances of the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[137]:
feature importance
1 human_development_index 0.753936
5 population 0.092143
0 hospital_beds_per_thousand 0.070220
2 extreme_poverty 0.061815
3 gdp_per_capita 0.020882
4 population_density 0.001005
In [138]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[138]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [139]:
country1 = 'France'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the two-country subset an independent frame so the lagged
# mortality columns assigned in later cells cannot trigger pandas'
# SettingWithCopyWarning on a slice view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [140]:
df_updated
Out[140]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
8377 France 1/25/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
8378 France 1/26/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
8379 France 1/27/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
8380 France 1/28/2020 86.060 4.77 30.1 35.6 19.718 42.0 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 14.431 37.3 0.11011

2107 rows × 9 columns

In [141]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1 day, 7 days, 30 days back) per country;
# a single grouped series is reused for all three shifts.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [142]:
# The first rows of each country have no history, so the shifted columns start
# as NaN; treat "no prior observation" as a mortality rate of 0.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [143]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'date', which includes
# 'Mortality Rate' (the prediction target) and its three lagged copies -- the
# target leaks into the components later used as model inputs. PCA is also
# fitted on the full dataset before the train/test split, leaking test-fold
# statistics. TODO: fit PCA on predictor columns of the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[143]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [144]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [145]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [146]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [147]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [148]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[148]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [149]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses mean/std learned from X_train
In [150]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuses training-set mean/std (no refit)
In [151]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [152]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the default (unshuffled) KFold ignores the time ordering of the rows;
# sklearn's TimeSeriesSplit would avoid validating on days earlier than the training folds.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9960173520783563
In [153]:
# Retrieve the model already refit with the best hyperparameters.
# NOTE: GridSearchCV defaults to refit=True, which retrains the best estimator on the
# full training set — the explicit best_model.fit(...) that was here retrained an
# identical model a second time, so it has been removed (same predictions, less compute).
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [154]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) renormalises both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric, and any zero in y_pred where y_test is nonzero would make it infinite.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.042620757311671066
R2 Score: 0.9966108339202479
RMSE: 0.206448
Entropy Value: 0.0015090598930850515
In [155]:
# Rank the model's inputs by XGBoost importance, highest first (last line displays it).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[155]:
feature importance
1 diabetes_prevalence 0.764287
0 cardiovasc_death_rate 0.178746
5 median_age 0.031670
3 male_smokers 0.009990
2 female_smokers 0.008057
4 aged_65_older 0.007250
In [156]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable relative path
# (e.g. a pathlib.Path DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[156]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [157]:
country1 = 'France'
country2 = 'Iceland'

# Keep only the country-health-index features, restricted to the two selected countries.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, feature_cols]
In [158]:
df_updated
Out[158]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2107 rows × 9 columns

In [159]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: previous day (1), previous week (7), previous month (30),
# shifted within each country so lags never cross country boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(periods)
In [160]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# Rows at the start of each country's series have no history to lag from.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [161]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and its
# lagged copies, so the components are partly built from the target — likely leakage
# that inflates downstream scores. PCA is also fit on unscaled columns, so
# large-magnitude features (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[161]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [162]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first n_components (highest-variance) components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [163]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL inputs),
# not the original features — labelling them with feature names is misleading; names
# like 'PC1'..'PC6' would be clearer, and later "feature importances" are importances
# of components, not of the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [164]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used afterwards — X is built
# from principal_df and y from 'Mortality Rate' — so this step could be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [165]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds principal-component scores (labelled with original feature names); y is the target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of daily time-series rows puts near-identical adjacent
# days into both train and test, which inflates test metrics; a chronological split
# would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [166]:
# Fit scaling on the training set
# Fit on the training split only, so test-set statistics never influence the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[166]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [167]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses mean/std learned from X_train
In [168]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuses training-set mean/std (no refit)
In [169]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [170]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the default (unshuffled) KFold ignores the time ordering of the rows;
# sklearn's TimeSeriesSplit would avoid validating on days earlier than the training folds.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9946715554617531
In [171]:
# Retrieve the model already refit with the best hyperparameters.
# NOTE: GridSearchCV defaults to refit=True, which retrains the best estimator on the
# full training set — the explicit best_model.fit(...) that was here retrained an
# identical model a second time, so it has been removed (same predictions, less compute).
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [172]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) renormalises both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric, and any zero in y_pred where y_test is nonzero would make it infinite.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06349754860870421
R2 Score: 0.9949507293753999
RMSE: 0.251987
Entropy Value: 0.002273841175109527
In [173]:
# Rank the model's inputs by XGBoost importance, highest first (last line displays it).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[173]:
feature importance
1 human_development_index 0.739557
0 hospital_beds_per_thousand 0.133697
4 population_density 0.060550
5 population 0.031155
3 gdp_per_capita 0.017913
2 extreme_poverty 0.017128
In [174]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable relative path
# (e.g. a pathlib.Path DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[174]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [175]:
country1 = 'Ireland'
country2 = 'Italy'

# Keep only the population-health-index features, restricted to the two selected countries.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
                'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, feature_cols]
In [176]:
df_updated
Out[176]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 3.28 23.0 25.7 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 23.021 47.9 0.735109

2099 rows × 9 columns

In [177]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: previous day (1), previous week (7), previous month (30),
# shifted within each country so lags never cross country boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(periods)
In [178]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# Rows at the start of each country's series have no history to lag from.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [179]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and its
# lagged copies, so the components are partly built from the target — likely leakage
# that inflates downstream scores. PCA is also fit on unscaled columns, so
# large-magnitude features dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[179]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [180]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first n_components (highest-variance) components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [181]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL inputs),
# not the original features — labelling them with feature names is misleading; names
# like 'PC1'..'PC6' would be clearer, and later "feature importances" are importances
# of components, not of the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [182]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used afterwards — X is built
# from principal_df and y from 'Mortality Rate' — so this step could be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [183]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
# X holds principal-component scores (labelled with original feature names); y is the target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of daily time-series rows puts near-identical adjacent
# days into both train and test, which inflates test metrics; a chronological split
# would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [184]:
# Fit scaling on the training set
# Fit on the training split only, so test-set statistics never influence the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[184]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [185]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses mean/std learned from X_train
In [186]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuses training-set mean/std (no refit)
In [187]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [188]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the default (unshuffled) KFold ignores the time ordering of the rows;
# sklearn's TimeSeriesSplit would avoid validating on days earlier than the training folds.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.999154237841806
In [189]:
# Retrieve the model already refit with the best hyperparameters.
# NOTE: GridSearchCV defaults to refit=True, which retrains the best estimator on the
# full training set — the explicit best_model.fit(...) that was here retrained an
# identical model a second time, so it has been removed (same predictions, less compute).
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [190]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) renormalises both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric, and any zero in y_pred where y_test is nonzero would make it infinite.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006427766546869428
R2 Score: 0.999471944421523
RMSE: 0.080173
Entropy Value: 0.00028870844431895304
In [191]:
# Rank the model's inputs by XGBoost importance, highest first (last line displays it).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[191]:
feature importance
5 median_age 0.844150
1 diabetes_prevalence 0.100753
0 cardiovasc_death_rate 0.043883
2 female_smokers 0.008767
3 male_smokers 0.002337
4 aged_65_older 0.000110
In [192]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable relative path
# (e.g. a pathlib.Path DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[192]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [193]:
country1 = 'Ireland'
country2 = 'Italy'

# Keep only the country-health-index features, restricted to the two selected countries.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, feature_cols]
In [194]:
df_updated
Out[194]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2099 rows × 9 columns

In [195]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: previous day (1), previous week (7), previous month (30),
# shifted within each country so lags never cross country boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, periods in [('prev_day_mortality', 1),
                         ('prev_week_mortality', 7),
                         ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(periods)
In [196]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# Rows at the start of each country's series have no history to lag from.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [197]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target) and its
# lagged copies, so the components are partly built from the target — likely leakage
# that inflates downstream scores. PCA is also fit on unscaled columns, so
# large-magnitude features (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[197]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [198]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first n_components (highest-variance) components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [199]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of ALL inputs),
# not the original features — labelling them with feature names is misleading; names
# like 'PC1'..'PC6' would be clearer, and later "feature importances" are importances
# of components, not of the named features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [200]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used afterwards — X is built
# from principal_df and y from 'Mortality Rate' — so this step could be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [201]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds principal-component scores (labelled with original feature names); y is the target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of daily time-series rows puts near-identical adjacent
# days into both train and test, which inflates test metrics; a chronological split
# would give a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [202]:
# Fit scaling on the training set
# Fit on the training split only, so test-set statistics never influence the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[202]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [203]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # uses mean/std learned from X_train
In [204]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)  # reuses training-set mean/std (no refit)
In [205]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [206]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the default (unshuffled) KFold ignores the time ordering of the rows;
# sklearn's TimeSeriesSplit would avoid validating on days earlier than the training folds.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988984971748192
In [207]:
# Retrieve the model already refit with the best hyperparameters.
# NOTE: GridSearchCV defaults to refit=True, which retrains the best estimator on the
# full training set — the explicit best_model.fit(...) that was here retrained an
# identical model a second time, so it has been removed (same predictions, less compute).
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [208]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) renormalises both vectors into probability
# distributions and returns their KL divergence — it is not a standard regression error
# metric, and any zero in y_pred where y_test is nonzero would make it infinite.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009889496543689754
R2 Score: 0.9991875554626719
RMSE: 0.099446
Entropy Value: 0.00044270709478587373
In [209]:
# Rank the model's inputs by XGBoost importance, highest first (last line displays it).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[209]:
feature importance
1 human_development_index 0.702587
0 hospital_beds_per_thousand 0.200820
5 population 0.042593
2 extreme_poverty 0.026734
3 gdp_per_capita 0.018469
4 population_density 0.008797
In [210]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable relative path
# (e.g. a pathlib.Path DATA_DIR constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[210]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [211]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Keep only the population-health-index features, restricted to the two selected countries.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'aged_65_older', 'median_age',
                'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, feature_cols]
In [212]:
df_updated
Out[212]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 20.9 26.0 14.312 39.7 0.377872

2078 rows × 9 columns

In [213]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged copies of the target (previous day / week / month) so the
# time series becomes a supervised-learning table; shifting within each
# location group guarantees no lag value crosses a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [214]:
# The first 1/7/30 rows of each country's series have no history for the lags;
# treat that missing history as zero mortality instead of dropping the rows.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [215]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and its three lag columns — the prediction target
# leaks into the PCA inputs. Restrict the fit to the six predictor columns.
# NOTE(review): PCA is variance-sensitive yet is fit here on unscaled data, and
# on the full dataset before the train/test split — both worth revisiting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[215]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [216]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): the transform input (iloc[:, 2:]) still includes the target and
# its lag columns, so these six components mix predictors with mortality itself.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [217]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# FIXME(review): these columns are principal components (PC1..PC6), not the
# original variables — reusing the raw feature names makes the downstream
# feature-importance table read as if it ranked the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [218]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never used afterwards (X is
# built from principal_df below), so this step appears to be dead weight.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [219]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
# X holds the six principal components (carrying reused feature names from
# principal_df); y comes from a different frame and aligns with X only by
# row position.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series lets future days
# train a model tested on past days — consider a chronological split
# (e.g. TimeSeriesSplit) to avoid temporal leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [220]:
# Fit scaling on the training set
# Fitting the scaler on training rows only keeps test-set statistics out of
# the standardization (correct practice).
scaler = StandardScaler()
scaler.fit(X_train)
Out[220]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [221]:
# Apply scaling on the training set
# Uses the statistics learned above; returns a new standardized array.
X_train_scaled = scaler.transform(X_train)
In [222]:
# Apply scaling on the test set
# Reuses the training-set statistics — the test set is never re-fit.
X_test_scaled = scaler.transform(X_test)
In [223]:
# Define XGBoost model
# random_state pins the stochastic row/column subsampling (subsample and
# colsample_bytree in the grid below are < 1) so results are reproducible.
xgb_model = xgb.XGBRegressor(random_state=42)

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with 10-fold CV that is 3,240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [224]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all cores; with refit=True (the default) GridSearchCV also
# refits the best configuration on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# NOTE(review): a CV R^2 near 0.999 is consistent with the target leaking into
# the PCA inputs upstream — treat this score with caution.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989910619474511
In [225]:
# GridSearchCV was run with refit=True (the default), so best_estimator_ is
# already fitted on the full training set — the explicit
# best_model.fit(X_train_scaled, y_train) repeated identical work and is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [226]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors after normalizing each to sum to 1 — it is not a standard
# regression metric, and it returns inf whenever a positive y_test entry lines
# up with a zero prediction; interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007086354331332736
R2 Score: 0.9990764156470773
RMSE: 0.084180
Entropy Value: 0.000702998195189773
In [227]:
# FIXME(review): the model was trained on principal components, so these
# importances rank PC1..PC6; the 'feature' labels below reuse the original
# variable names from selected_cols and therefore do not directly measure
# the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[227]:
feature importance
1 diabetes_prevalence 0.685790
0 cardiovasc_death_rate 0.259016
5 median_age 0.038685
2 female_smokers 0.013791
3 male_smokers 0.002615
4 aged_65_older 0.000103
In [228]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a path relative to a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[228]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [229]:
countries_of_interest = ['Luxembourg', 'Netherlands']
country1, country2 = countries_of_interest

# Keep only the country-health predictors plus identifiers and the target
country_health_cols = [
    'location', 'date',
    'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita',
    'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin(countries_of_interest), country_health_cols]
In [230]:
df_updated
Out[230]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.2 94277.965 231.447 647601 0.377872

2078 rows × 9 columns

In [231]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged copies of the target (previous day / week / month) so the
# time series becomes a supervised-learning table; shifting within each
# location group guarantees no lag value crosses a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [232]:
# The first 1/7/30 rows of each country's series have no history for the lags;
# treat that missing history as zero mortality instead of dropping the rows.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [233]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and its three lag columns — the prediction target
# leaks into the PCA inputs. Restrict the fit to the six predictor columns.
# NOTE(review): PCA is fit on unscaled data; with 'population' (values in the
# millions, per the table above) present, its variance will dominate the
# leading components. PCA is also fit before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[233]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [234]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): the transform input (iloc[:, 2:]) still includes the target and
# its lag columns, so these six components mix predictors with mortality itself.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [235]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# FIXME(review): these columns are principal components (PC1..PC6), not the
# original variables — reusing the raw feature names makes the downstream
# feature-importance table read as if it ranked the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [236]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never used afterwards (X is
# built from principal_df below), so this step appears to be dead weight.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [237]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the six principal components (carrying reused feature names from
# principal_df); y comes from a different frame and aligns with X only by
# row position.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series lets future days
# train a model tested on past days — consider a chronological split
# (e.g. TimeSeriesSplit) to avoid temporal leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [238]:
# Fit scaling on the training set
# Fitting the scaler on training rows only keeps test-set statistics out of
# the standardization (correct practice).
scaler = StandardScaler()
scaler.fit(X_train)
Out[238]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [239]:
# Apply scaling on the training set
# Uses the statistics learned above; returns a new standardized array.
X_train_scaled = scaler.transform(X_train)
In [240]:
# Apply scaling on the test set
# Reuses the training-set statistics — the test set is never re-fit.
X_test_scaled = scaler.transform(X_test)
In [241]:
# Define XGBoost model
# random_state pins the stochastic row/column subsampling (subsample and
# colsample_bytree in the grid below are < 1) so results are reproducible.
xgb_model = xgb.XGBRegressor(random_state=42)

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with 10-fold CV that is 3,240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [242]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all cores; with refit=True (the default) GridSearchCV also
# refits the best configuration on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# NOTE(review): a CV R^2 near 0.999 is consistent with the target leaking into
# the PCA inputs upstream — treat this score with caution.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987243093178199
In [243]:
# GridSearchCV was run with refit=True (the default), so best_estimator_ is
# already fitted on the full training set — the explicit
# best_model.fit(X_train_scaled, y_train) repeated identical work and is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [244]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors after normalizing each to sum to 1 — it is not a standard
# regression metric, and it returns inf whenever a positive y_test entry lines
# up with a zero prediction; interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007716113263742701
R2 Score: 0.9989943374064345
RMSE: 0.087841
Entropy Value: 0.0017161933932385992
In [245]:
# FIXME(review): the model was trained on principal components, so these
# importances rank PC1..PC6; the 'feature' labels below reuse the original
# variable names from selected_cols and therefore do not directly measure
# the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[245]:
feature importance
1 human_development_index 0.482469
2 extreme_poverty 0.244199
0 hospital_beds_per_thousand 0.199908
5 population 0.057787
3 gdp_per_capita 0.014620
4 population_density 0.001017
In [246]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a path relative to a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[246]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [247]:
countries_of_interest = ['Portugal', 'Slovenia']
country1, country2 = countries_of_interest

# Keep only the population-health predictors plus identifiers and the target
population_health_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers',
    'aged_65_older', 'median_age',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin(countries_of_interest), population_health_cols]
In [248]:
df_updated
Out[248]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 19.062 44.5 0.536669

2096 rows × 9 columns

In [249]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged copies of the target (previous day / week / month) so the
# time series becomes a supervised-learning table; shifting within each
# location group guarantees no lag value crosses a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [250]:
# The first 1/7/30 rows of each country's series have no history for the lags;
# treat that missing history as zero mortality instead of dropping the rows.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [251]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and its three lag columns — the prediction target
# leaks into the PCA inputs. Restrict the fit to the six predictor columns.
# NOTE(review): PCA is variance-sensitive yet is fit here on unscaled data, and
# on the full dataset before the train/test split — both worth revisiting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[251]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [252]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): the transform input (iloc[:, 2:]) still includes the target and
# its lag columns, so these six components mix predictors with mortality itself.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [253]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# FIXME(review): these columns are principal components (PC1..PC6), not the
# original variables — reusing the raw feature names makes the downstream
# feature-importance table read as if it ranked the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [254]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never used afterwards (X is
# built from principal_df below), so this step appears to be dead weight.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [255]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
# X holds the six principal components (carrying reused feature names from
# principal_df); y comes from a different frame and aligns with X only by
# row position.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series lets future days
# train a model tested on past days — consider a chronological split
# (e.g. TimeSeriesSplit) to avoid temporal leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [256]:
# Fit scaling on the training set
# Fitting the scaler on training rows only keeps test-set statistics out of
# the standardization (correct practice).
scaler = StandardScaler()
scaler.fit(X_train)
Out[256]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [257]:
# Apply scaling on the training set
# Uses the statistics learned above; returns a new standardized array.
X_train_scaled = scaler.transform(X_train)
In [258]:
# Apply scaling on the test set
# Reuses the training-set statistics — the test set is never re-fit.
X_test_scaled = scaler.transform(X_test)
In [259]:
# Define XGBoost model
# random_state pins the stochastic row/column subsampling (subsample and
# colsample_bytree in the grid below are < 1) so results are reproducible.
xgb_model = xgb.XGBRegressor(random_state=42)

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with 10-fold CV that is 3,240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [260]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all cores; with refit=True (the default) GridSearchCV also
# refits the best configuration on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# NOTE(review): a CV R^2 near 0.999 is consistent with the target leaking into
# the PCA inputs upstream — treat this score with caution.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985698342591484
In [261]:
# GridSearchCV was run with refit=True (the default), so best_estimator_ is
# already fitted on the full training set — the explicit
# best_model.fit(X_train_scaled, y_train) repeated identical work and is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [262]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors after normalizing each to sum to 1 — it is not a standard
# regression metric, and it returns inf whenever a positive y_test entry lines
# up with a zero prediction; interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002959592405574864
R2 Score: 0.9985184915498152
RMSE: 0.054402
Entropy Value: 0.00030518164442021166
In [263]:
# FIXME(review): the model was trained on principal components, so these
# importances rank PC1..PC6; the 'feature' labels below reuse the original
# variable names from selected_cols and therefore do not directly measure
# the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[263]:
feature importance
1 diabetes_prevalence 0.865331
0 cardiovasc_death_rate 0.085650
3 male_smokers 0.024057
2 female_smokers 0.020765
5 median_age 0.003633
4 aged_65_older 0.000564
In [264]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a path relative to a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[264]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [265]:
countries_of_interest = ['Portugal', 'Slovenia']
country1, country2 = countries_of_interest

# Keep only the country-health predictors plus identifiers and the target
country_health_cols = [
    'location', 'date',
    'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita',
    'population_density', 'population',
    'Mortality Rate',
]
df_updated = df_updated.loc[df_updated['location'].isin(countries_of_interest), country_health_cols]
In [266]:
df_updated
Out[266]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2096 rows × 9 columns

In [267]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged copies of the target (previous day / week / month) so the
# time series becomes a supervised-learning table; shifting within each
# location group guarantees no lag value crosses a country boundary.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [268]:
# The first 1/7/30 rows of each country's series have no history for the lags;
# treat that missing history as zero mortality instead of dropping the rows.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [269]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# FIXME(review): iloc[:, 2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' and its three lag columns — the prediction target
# leaks into the PCA inputs. Restrict the fit to the six predictor columns.
# NOTE(review): PCA is fit on unscaled data; with 'population' (values in the
# millions, per the table above) present, its variance will dominate the
# leading components. PCA is also fit before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[269]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [270]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): the transform input (iloc[:, 2:]) still includes the target and
# its lag columns, so these six components mix predictors with mortality itself.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [271]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# FIXME(review): these columns are principal components (PC1..PC6), not the
# original variables — reusing the raw feature names makes the downstream
# feature-importance table read as if it ranked the original features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [272]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never used afterwards (X is
# built from principal_df below), so this step appears to be dead weight.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [273]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the six principal components (carrying reused feature names from
# principal_df); y comes from a different frame and aligns with X only by
# row position.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series lets future days
# train a model tested on past days — consider a chronological split
# (e.g. TimeSeriesSplit) to avoid temporal leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [274]:
# Fit scaling on the training set
# Fitting the scaler on training rows only keeps test-set statistics out of
# the standardization (correct practice).
scaler = StandardScaler()
scaler.fit(X_train)
Out[274]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [275]:
# Apply scaling on the training set
# Uses the statistics learned above; returns a new standardized array.
X_train_scaled = scaler.transform(X_train)
In [276]:
# Apply scaling on the test set
# Reuses the training-set statistics — the test set is never re-fit.
X_test_scaled = scaler.transform(X_test)
In [277]:
# Define XGBoost model
# random_state pins the stochastic row/column subsampling (subsample and
# colsample_bytree in the grid below are < 1) so results are reproducible.
xgb_model = xgb.XGBRegressor(random_state=42)

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with 10-fold CV that is 3,240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [278]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all cores; with refit=True (the default) GridSearchCV also
# refits the best configuration on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# NOTE(review): a CV R^2 near 0.999 is consistent with the target leaking into
# the PCA inputs upstream — treat this score with caution.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987931112255966
In [279]:
# GridSearchCV was run with refit=True (the default), so best_estimator_ is
# already fitted on the full training set — the explicit
# best_model.fit(X_train_scaled, y_train) repeated identical work and is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [280]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two vectors after normalizing each to sum to 1 — it is not a standard
# regression metric, and it returns inf whenever a positive y_test entry lines
# up with a zero prediction; interpret this value with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002719376915070921
R2 Score: 0.9986387382697272
RMSE: 0.052148
Entropy Value: 0.00030312420047472904
In [281]:
# FIXME(review): the model was trained on principal components, so these
# importances rank PC1..PC6; the 'feature' labels below reuse the original
# variable names from selected_cols and therefore do not directly measure
# the original variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[281]:
feature importance
1 human_development_index 0.727041
5 population 0.146595
0 hospital_beds_per_thousand 0.068287
2 extreme_poverty 0.033873
3 gdp_per_capita 0.023341
4 population_density 0.000863
In [282]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a path relative to a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[282]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [283]:
country1 = 'Spain'
country2 = 'Sweden'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame an independent object, so the lagged-column
# assignments in later cells do not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [284]:
df_updated
Out[284]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
23011 Sweden 2/1/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
23012 Sweden 2/2/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
23013 Sweden 2/3/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
23014 Sweden 2/4/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
23015 Sweden 2/5/2020 133.982 4.79 18.8 18.9 19.985 41.0 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 19.436 45.5 0.855148

2126 rows × 9 columns

In [285]:
# Convert the time series to a supervised-learning table: alongside its static
# features, each row gets the mortality rate observed 1 day, 7 days, and
# 30 days earlier within the same country, so XGBoost can be applied directly
# to rank predictors of COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [286]:
# The earliest rows per country have no history to lag from; treat those
# missing lags as 0 mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [287]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled data, on ALL rows (before the
# train/test split), and columns 2: include 'Mortality Rate' plus the lag
# columns — so the target itself leaks into the components. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[287]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [288]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# NOTE(review): the PCA input (columns 2:) also contains 'Mortality Rate' and
# the lag columns, so these six components mix in the target — TODO confirm.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [289]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are the ORIGINAL feature names, but the
# values are principal components (linear mixtures of all PCA inputs); the
# labels — and the feature-importance table derived from them later — are
# therefore misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
# principal_components preserves df_updated's row order, so .values aligns.
principal_df['location'] = df_updated['location'].values
In [290]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df), so this effectively just removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [291]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
# X holds the six principal components (labelled with original feature names);
# y is the raw mortality rate. Alignment relies on both frames sharing the
# same row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# (70/30, seeded for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [292]:
# Fit scaling on the training set
# The scaler is fit on the training split only, so test-set statistics do not
# leak into the standardisation.
scaler = StandardScaler()
scaler.fit(X_train)
Out[292]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [293]:
# Apply scaling on the training set (using the training-set mean/std)
X_train_scaled = scaler.transform(X_train)
In [294]:
# Apply scaling on the test set (reusing the training-set mean/std)
X_test_scaled = scaler.transform(X_test)
In [295]:
# Define XGBoost model
# NOTE(review): no random_state is set; with subsample/colsample_bytree < 1
# the CV scores below are not exactly reproducible — TODO confirm whether a
# fixed seed is wanted.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [296]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is the estimator's R^2; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998867876669354
In [297]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this explicit fit is redundant — and, with no
# fixed random seed and subsampling enabled, may not reproduce the searched
# model exactly. TODO confirm.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [298]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it yields inf when some y_pred entry is 0
# where y_test > 0 (and is ill-defined for negative values); the 'inf'
# printed below is exactly this failure mode. TODO reconsider this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009929112730334729
R2 Score: 0.9988353451448788
RMSE: 0.099645
Entropy Value: inf
In [299]:
# Rank the model inputs (principal components labelled with the original
# feature names) by the importance XGBoost assigned to each.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[299]:
feature importance
1 diabetes_prevalence 0.943762
2 female_smokers 0.031796
0 cardiovasc_death_rate 0.012842
3 male_smokers 0.006060
5 median_age 0.005185
4 aged_65_older 0.000355
In [300]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[300]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [301]:
country1 = 'Spain'
country2 = 'Sweden'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame an independent object, so the lagged-column
# assignments in later cells do not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [302]:
df_updated
Out[302]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
23011 Sweden 2/1/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
23012 Sweden 2/2/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
23013 Sweden 2/3/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
23014 Sweden 2/4/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
23015 Sweden 2/5/2020 2.22 0.945 0.5 46949.283 24.718 10549349 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148

2126 rows × 9 columns

In [303]:
# Convert the time series to a supervised-learning table: alongside its static
# features, each row gets the mortality rate observed 1 day, 7 days, and
# 30 days earlier within the same country, so XGBoost can be applied directly
# to rank predictors of COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [304]:
# The earliest rows per country have no history to lag from; treat those
# missing lags as 0 mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [305]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled data, on ALL rows (before the
# train/test split), and columns 2: include 'Mortality Rate' plus the lag
# columns — so the target itself leaks into the components. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[305]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [306]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [307]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are the ORIGINAL feature names, but the
# values are principal components (linear mixtures of all PCA inputs); the
# labels — and the feature-importance table derived from them later — are
# therefore misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# principal_components preserves df_updated's row order, so .values aligns.
principal_df['location'] = df_updated['location'].values
In [308]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [309]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [310]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[310]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [311]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [312]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [313]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [314]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987460071894578
In [315]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this explicit fit is redundant — and, with no
# fixed random seed and subsampling enabled, may not reproduce the searched
# model exactly. TODO confirm.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [316]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it yields inf when some y_pred entry is 0
# where y_test > 0 (and is ill-defined for negative values). TODO reconsider.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.014248650470063625
R2 Score: 0.9983286764487842
RMSE: 0.119368
Entropy Value: 0.0007314147275536079
In [317]:
# Rank the model inputs (principal components labelled with the original
# feature names) by the importance XGBoost assigned to each.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[317]:
feature importance
1 human_development_index 0.928746
2 extreme_poverty 0.039890
5 population 0.023575
3 gdp_per_capita 0.006665
0 hospital_beds_per_thousand 0.000593
4 population_density 0.000530
In [318]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[318]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [319]:
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame an independent object, so the lagged-column
# assignments in later cells do not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [320]:
df_updated
Out[320]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 28.9 18.436 43.1 0.322149

2102 rows × 9 columns

In [321]:
# Convert the time series to a supervised-learning table: alongside its static
# features, each row gets the mortality rate observed 1 day, 7 days, and
# 30 days earlier within the same country, so XGBoost can be applied directly
# to rank predictors of COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [322]:
# The earliest rows per country have no history to lag from; treat those
# missing lags as 0 mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [323]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled data, on ALL rows (before the
# train/test split), and columns 2: include 'Mortality Rate' plus the lag
# columns — so the target itself leaks into the components. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[323]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [324]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [325]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are the ORIGINAL feature names, but the
# values are principal components (linear mixtures of all PCA inputs); the
# labels — and the feature-importance table derived from them later — are
# therefore misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
# principal_components preserves df_updated's row order, so .values aligns.
principal_df['location'] = df_updated['location'].values
In [326]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [327]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [328]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[328]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [329]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [330]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [331]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [332]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9551717291204419
In [333]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this explicit fit is redundant — and, with no
# fixed random seed and subsampling enabled, may not reproduce the searched
# model exactly. TODO confirm.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [334]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it yields inf when some y_pred entry is 0
# where y_test > 0 (and is ill-defined for negative values). TODO reconsider.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.14991967159078062
R2 Score: 0.9940801441598267
RMSE: 0.387195
Entropy Value: 0.0024482743045327773
In [335]:
# Rank the model inputs (principal components labelled with the original
# feature names) by the importance XGBoost assigned to each.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[335]:
feature importance
0 cardiovasc_death_rate 0.481112
5 median_age 0.306261
1 diabetes_prevalence 0.069334
2 female_smokers 0.062658
4 aged_65_older 0.045872
3 male_smokers 0.034763
In [336]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[336]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [337]:
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame an independent object, so the lagged-column
# assignments in later cells do not raise pandas' SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [338]:
df_updated
Out[338]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.20 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.20 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.20 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.20 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.20 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14645 Switzerland 12/26/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14646 Switzerland 12/27/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14647 Switzerland 12/28/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.323082
14648 Switzerland 12/29/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322149

2102 rows × 9 columns

In [339]:
# Convert the time series to a supervised-learning table: alongside its static
# features, each row gets the mortality rate observed 1 day, 7 days, and
# 30 days earlier within the same country, so XGBoost can be applied directly
# to rank predictors of COVID-19 mortality per country.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [340]:
# The earliest rows per country have no history to lag from; treat those
# missing lags as 0 mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [341]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled data, on ALL rows (before the
# train/test split), and columns 2: include 'Mortality Rate' plus the lag
# columns — so the target itself leaks into the components. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[341]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [342]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [343]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are the ORIGINAL feature names, but the
# values are principal components (linear mixtures of all PCA inputs); the
# labels — and the feature-importance table derived from them later — are
# therefore misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# principal_components preserves df_updated's row order, so .values aligns.
principal_df['location'] = df_updated['location'].values
In [344]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [345]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [346]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[346]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [347]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [348]:
# Apply scaling on the test set (using the training-set statistics, as it should be)
X_test_scaled = scaler.transform(X_test)
In [349]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 below this means 3240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [350]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring for a regressor is R^2; the CV folds are again random,
# non-temporal splits, so the score shares the leakage caveats noted above.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.952952674424612
In [351]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True, the default) has already refit best_estimator_
# on the full training set, so this .fit() call is redundant (harmless, but wasted work).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [352]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of p and q treated as
# (normalized) probability distributions; mortality rates are not distributions, and it
# yields inf/nan for non-positive entries (an 'inf' appears later in this notebook).
# This metric is not meaningful for regression residuals.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.8303105819555134
R2 Score: 0.9672136491789828
RMSE: 0.911214
Entropy Value: 0.006240288209575662
In [353]:
# NOTE(review): these "features" are principal components mislabelled with original column
# names (see the principal_df construction); the importances describe components, not the
# named variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[353]:
feature importance
5 population 0.314746
1 human_development_index 0.291846
2 extreme_poverty 0.138111
0 hospital_beds_per_thousand 0.116382
4 population_density 0.079420
3 gdp_per_capita 0.059494
In [354]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — prefer a relative path or a configurable
# DATA_DIR; also consider reading the CSV once and taking .copy() per analysis instead of
# re-reading it before every run.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[354]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [355]:
# Pair of countries compared in this run.
country1 = 'Czechia'
country2 = 'Estonia'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): these demographic columns are constant within each country (visible in the
# preview below), so with only two countries they can mostly encode country identity rather
# than within-country variation over time.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [356]:
df_updated
Out[356]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.464100
7306 Estonia 12/26/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.464100
7307 Estonia 12/27/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.463645
7308 Estonia 12/28/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.466423
7309 Estonia 12/29/2022 255.569 4.02 24.5 39.3 19.452 42.7 0.466423

2095 rows × 9 columns

In [357]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): this cell and the pipeline below are a verbatim copy of the run earlier in
# the notebook; factor the shift -> fillna -> PCA -> split -> scale -> grid-search -> evaluate
# sequence into a function parameterized by feature list and country pair to avoid copy drift.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [358]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fills the lag-less first rows of each country with a fabricated 0 history.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [359]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns — target leakage
# into the PCA whose components are later used to predict that same target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[359]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [360]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA input has more than 6 columns (features + target + 3 lags).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [361]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the named original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [362]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [363]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random (non-temporal) split of a daily time series — see earlier caveat.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [364]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[364]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [365]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [366]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [367]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): identical grid to the earlier run — define it once near the top of the
# notebook and reuse it, so the copies cannot drift apart.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [368]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-perfect CV score printed below is consistent with the target
# leakage flagged at the PCA step.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988914645141207
In [369]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV already refit best_estimator_ (refit=True default).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [370]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): KL divergence of non-distributions — not a meaningful regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0009737929752882011
R2 Score: 0.998586037150187
RMSE: 0.031206
Entropy Value: 0.00030888828385945916
In [371]:
# NOTE(review): importances refer to mislabelled principal components, not the original
# named variables (see the principal_df construction above).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[371]:
feature importance
1 diabetes_prevalence 0.829546
5 median_age 0.073210
0 cardiovasc_death_rate 0.065060
2 female_smokers 0.021690
3 male_smokers 0.010329
4 aged_65_older 0.000165
In [372]:
# Importing the dataframe of all 26 countries
# NOTE(review): re-reads the same CSV from a hard-coded absolute path before every run;
# load once into a raw frame and take .copy() per analysis instead.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[372]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [373]:
# Pair of countries compared in this run.
country1 = 'Czechia'
country2 = 'Estonia'

# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): these infrastructure/economic columns are constant within each country, so
# with only two countries they mostly encode country identity.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [374]:
df_updated
Out[374]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.900 0.0 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423

2095 rows × 9 columns

In [375]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): third verbatim copy of this pipeline — extract a reusable function.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [376]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [377]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lags — target leakage into PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[377]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [378]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA input has more than 6 columns (features + target + 3 lags).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [379]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the named original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [380]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [381]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random (non-temporal) split of a daily time series — see earlier caveat.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [382]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[382]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [383]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [384]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [385]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): identical grid to the earlier runs — define once and reuse.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [386]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982992559865769
In [387]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV already refit best_estimator_ (refit=True default).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [388]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): KL divergence of non-distributions; it can yield inf/nan for non-positive
# entries — the output below shows "Entropy Value: inf". Not a valid regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0013297376906791
R2 Score: 0.9980691997762051
RMSE: 0.036466
Entropy Value: inf
In [389]:
# NOTE(review): importances refer to mislabelled principal components, not the original
# named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[389]:
feature importance
1 human_development_index 0.691546
0 hospital_beds_per_thousand 0.132334
5 population 0.127958
2 extreme_poverty 0.030192
3 gdp_per_capita 0.017370
4 population_density 0.000600
In [390]:
# Importing the dataframe of all 26 countries
# NOTE(review): repeated CSV reload from a hard-coded absolute path — load once, .copy() per run.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[390]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [391]:
# Pair of countries compared in this run.
country1 = 'United States'
country2 = 'Bulgaria'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): same two-country caveat as before — these predictors are constant per country.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [392]:
df_updated
Out[392]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 15.413 38.3 1.084791

2100 rows × 9 columns

In [393]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): fourth verbatim copy of this pipeline — extract a reusable function.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [394]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [395]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lags — target leakage into PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[395]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [396]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA input has more than 6 columns (features + target + 3 lags).
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [397]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the named original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [398]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused below (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [399]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random (non-temporal) split of a daily time series — see earlier caveat.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [400]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[400]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [401]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [402]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [403]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): identical grid to the earlier runs — define once and reuse.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [404]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9632592414289769
In [405]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV already refit best_estimator_ (refit=True default).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [406]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): KL divergence of non-distributions — not a meaningful regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007520814879905908
R2 Score: 0.9962170632858504
RMSE: 0.086723
Entropy Value: 0.0005526318445337031
In [407]:
# NOTE(review): importances refer to mislabelled principal components, not the original
# named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[407]:
feature importance
0 cardiovasc_death_rate 0.332596
1 diabetes_prevalence 0.290038
5 median_age 0.260804
2 female_smokers 0.059976
4 aged_65_older 0.029853
3 male_smokers 0.026733
In [408]:
# Importing the dataframe of all 26 countries
# NOTE(review): repeated CSV reload from a hard-coded absolute path — load once, .copy() per run.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[408]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [409]:
# Pair of countries compared in this run.
country1 = 'United States'
country2 = 'Bulgaria'

# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): same two-country caveat — these columns are constant per country.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [410]:
df_updated
Out[410]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.770 0.926 1.2 54225.446 35.608 338289856 1.084791

2100 rows × 9 columns

In [411]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (1 day, 1 week, 1 month back), computed
# within each country so lag values never bleed across locations.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [412]:
# Rows with no earlier history get NaN lags; treat missing history as zero
# mortality, matching the zero-mortality start of each country's series.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [413]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which
# here includes 'Mortality Rate' itself plus its three lagged copies — the
# prediction target leaks into the principal components later used as model
# inputs, which likely inflates the reported scores. Consider fitting PCA on
# the six predictor columns only.
# NOTE(review): PCA is fit on unscaled data while StandardScaler is applied
# only after PCA; high-variance columns (e.g. population) will dominate the
# components — scaling usually comes first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[413]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [414]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA input space has 10 columns here (6 predictors plus the
# target and its 3 lags), so keeping the first 6 components is a variance
# cut-off, not a one-to-one mapping back to the six input variables — check
# pca.explained_variance_ratio_ before fixing n_components at 6.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [415]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column below is a principal component (a linear mix of
# all PCA input columns), not the raw feature it is named after; this naming
# makes the later feature-importance table read as a ranking of the original
# variables, which it is not.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [416]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated after this;
# confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [417]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs are the six principal components; the target comes from the
# original frame. The two frames are aligned row-by-row by construction.
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility.
# NOTE(review): a random split shuffles a time series — a chronological
# split would better reflect real forecasting use.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [418]:
# Fit scaling on the training set
# Standardization parameters (mean/std) are learned from the training split
# only, so no test-set information leaks into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[418]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [419]:
# Apply scaling on the training set
# Standardize the training features with the train-set mean/std.
X_train_scaled = scaler.transform(X_train)
In [420]:
# Apply scaling on the test set
# Reuse the train-fitted mean/std — the test set is never refit.
X_test_scaled = scaler.transform(X_test)
In [421]:
# Define the XGBoost regressor with default settings; tuning happens below
# via grid search.
xgb_model = xgb.XGBRegressor()

# Search space: tree depth, shrinkage, ensemble size, minimum split loss,
# and row/column subsampling rates.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [422]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 model fits; n_jobs=-1 uses
# all cores. Scoring defaults to the estimator's score(), i.e. R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9557094751056194
In [423]:
# GridSearchCV(refit=True, the default) has already refit the best parameter
# combination on the full training set, so best_estimator_ is fitted and the
# extra fit() call here was redundant work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [424]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes its inputs and returns
# the KL divergence between them — it is not a standard regression metric,
# and zero/negative values in either array break it (other repetitions of
# this pipeline print 'Entropy Value: inf'). Consider dropping it or
# justifying the distributional reading of mortality rates.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01529064233782365
R2 Score: 0.9923088743432271
RMSE: 0.123655
Entropy Value: 0.0009932015791245545
In [425]:
feature_importances = best_model.feature_importances_
# NOTE(review): these importances rank the six principal components, which
# were only relabeled with the raw feature names — they are not importances
# of the original variables themselves.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[425]:
feature importance
0 hospital_beds_per_thousand 0.355586
5 population 0.256295
1 human_development_index 0.226996
4 population_density 0.069507
2 extreme_poverty 0.056325
3 gdp_per_capita 0.035290
In [426]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[426]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [427]:
country1 = 'Latvia'
country2 = 'Romania'

# Restrict to the two countries first, then keep the population health-index
# features. (Same pipeline as the previous section — a parameterized helper
# function would avoid this repeated copy-paste.)
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
In [428]:
df_updated
Out[428]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
17800 Romania 2/26/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
17801 Romania 2/27/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
17802 Romania 2/28/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
17803 Romania 2/29/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
17804 Romania 3/1/2020 370.946 9.74 22.9 37.1 17.850 43.0 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 19.754 43.9 0.631969

2076 rows × 9 columns

In [429]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [430]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [431]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the principal components used
# as model inputs — likely inflating the reported scores; consider fitting
# PCA on the predictor columns only. PCA is also fit on unscaled data here
# (StandardScaler runs only afterwards), letting high-variance columns
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[431]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [432]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [433]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of all
# PCA inputs), not the raw features they are named after; the downstream
# feature-importance table therefore does not rank the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [434]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [435]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [436]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[436]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [437]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [438]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [439]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [440]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998693558932106
In [441]:
# GridSearchCV(refit=True, the default) has already refit the best parameter
# combination on the full training set, so best_estimator_ is fitted and the
# extra fit() call here was redundant work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [442]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between
# normalized distributions, not a regression error; this run prints
# 'Entropy Value: inf' below, caused by zero/invalid entries in the inputs.
# Consider removing this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0019333636112641855
R2 Score: 0.9986831440091684
RMSE: 0.043970
Entropy Value: inf
In [443]:
feature_importances = best_model.feature_importances_
# NOTE(review): importances here belong to the six principal components that
# were relabeled with raw feature names — not to the original variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[443]:
feature importance
0 cardiovasc_death_rate 0.488905
1 diabetes_prevalence 0.289292
5 median_age 0.199869
2 female_smokers 0.017471
3 male_smokers 0.003379
4 aged_65_older 0.001083
In [444]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[444]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [445]:
country1 = 'Latvia'
country2 = 'Romania'

# Restrict to the two countries first, then keep the country health-index
# features. (Third copy of the same pipeline — a parameterized helper
# function would avoid this duplication.)
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
In [446]:
df_updated
Out[446]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
17800 Romania 2/26/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
17801 Romania 2/27/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
17802 Romania 2/28/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
17803 Romania 2/29/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
17804 Romania 3/1/2020 6.892 0.828 5.7 23313.199 85.129 19659270 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.570 0.866 0.7 25063.846 31.212 1850654 0.631969

2076 rows × 9 columns

In [447]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [448]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [449]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the principal components used
# as model inputs — likely inflating the reported scores; consider fitting
# PCA on the predictor columns only, and scaling before (not after) PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[449]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [450]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [451]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of all
# PCA inputs), not the raw features they are named after; the downstream
# feature-importance table therefore does not rank the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [452]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [453]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [454]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[454]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [455]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [456]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [457]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [458]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983819303279453
In [459]:
# GridSearchCV(refit=True, the default) has already refit the best parameter
# combination on the full training set, so best_estimator_ is fitted and the
# extra fit() call here was redundant work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [460]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes its inputs and returns
# KL divergence — not a regression metric; zero/negative values break it
# (other repetitions print 'Entropy Value: inf'). Consider removing it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0020986676904163337
R2 Score: 0.9985705518068159
RMSE: 0.045811
Entropy Value: 0.0002345345172898176
In [461]:
feature_importances = best_model.feature_importances_
# NOTE(review): importances here belong to the six principal components that
# were relabeled with raw feature names — not to the original variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[461]:
feature importance
0 hospital_beds_per_thousand 0.672578
1 human_development_index 0.206698
5 population 0.109832
2 extreme_poverty 0.010005
3 gdp_per_capita 0.000671
4 population_density 0.000216
In [462]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[462]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [463]:
country1 = 'Serbia'
country2 = 'Slovakia'

# Restrict to the two countries first, then keep the population health-index
# features. (Fourth copy of the same pipeline — a parameterized helper
# function would avoid this duplication.)
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age', 'Mortality Rate']]
In [464]:
df_updated
Out[464]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 17.366 41.2 0.716205

2067 rows × 9 columns

In [465]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [466]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [467]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lagged
# copies, so the prediction target leaks into the principal components used
# as model inputs — likely inflating the reported scores; consider fitting
# PCA on the predictor columns only, and scaling before (not after) PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[467]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [468]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [469]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of all
# PCA inputs), not the raw features they are named after; the downstream
# feature-importance table therefore does not rank the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [470]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [471]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [472]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[472]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [473]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [474]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [475]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [476]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9958533450047872
In [477]:
# GridSearchCV(refit=True, the default) has already refit the best parameter
# combination on the full training set, so best_estimator_ is fitted and the
# extra fit() call here was redundant work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [478]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence between
# normalized distributions, not a regression error; this run prints
# 'Entropy Value: inf' below, caused by zero/invalid entries in the inputs.
# Consider removing this metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0008290251329329237
R2 Score: 0.996607231553571
RMSE: 0.028793
Entropy Value: inf
In [479]:
feature_importances = best_model.feature_importances_
# NOTE(review): importances here belong to the six principal components that
# were relabeled with raw feature names — not to the original variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[479]:
feature importance
1 diabetes_prevalence 0.667632
0 cardiovasc_death_rate 0.144057
5 median_age 0.127156
2 female_smokers 0.026975
4 aged_65_older 0.021287
3 male_smokers 0.012893
In [480]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- breaks on any other
# machine; prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[480]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [481]:
country1 = 'Serbia'
country2 = 'Slovakia'

# Keep the country-health-index features (plus identifiers and target) and
# only the rows belonging to the two countries under comparison.
selected_columns = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                    'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                    'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), selected_columns]
In [482]:
df_updated
Out[482]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716205

2067 rows × 9 columns

In [483]:
'''
Create lagged mortality-rate variables (previous day / week / month) with
pandas shift() so the time series becomes a supervised-learning problem that
XGBoost can consume: each row is one observation, each column one feature.
'''
# df_updated is a filtered slice of a larger frame; take an explicit copy so
# the column assignments below cannot raise SettingWithCopyWarning or be
# silently dropped under pandas copy-on-write.
df_updated = df_updated.copy()

# Lagged variables for the previous day / week / month mortality rate,
# computed per country so one country's history never leaks into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [484]:
# Replace the NaN values that shift() left at the head of each country's
# series with 0, for all three lag columns at once.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [485]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lag columns -- target
# leakage. PCA is also fitted on ALL rows before the train/test split, and on
# unscaled data, so high-variance columns (e.g. population) dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[485]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [486]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project every row onto the first 6 principal components (transform output
# columns are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [487]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a mixture of all PCA inputs), not that feature.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [488]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never selected into X below, so this
# encoding does not reach the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [489]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds principal-component scores (labelled with original feature names);
# y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows lets the model see
# future observations of the same country during training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [490]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[490]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [491]:
# Apply scaling on the training set
# (uses the StandardScaler fitted in the previous cell; transform only, so the
# scaling parameters come solely from the training split)
X_train_scaled = scaler.transform(X_train)
In [492]:
# Apply scaling on the test set
# Reuse the scaler fitted on the training set -- never refit on test data.
X_test_scaled = scaler.transform(X_test)
In [493]:
# Define the (untuned) XGBoost regressor to be tuned below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for GridSearchCV (same values, keyword form)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [494]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled KFold on time-series-derived rows mixes past and
# future observations across folds, so the CV score is likely optimistic --
# consider TimeSeriesSplit. n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9967966965293827
In [495]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so this extra fit() is redundant
# (but harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [496]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and an entropy-based divergence measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) normalises both inputs and computes the
# Kullback-Leibler divergence; it returns inf whenever y_test contains a zero
# paired with a non-zero prediction (which is why earlier runs printed "inf").
# Restrict the computation to strictly positive pairs so the value is finite.
_pos = (np.asarray(y_test) > 0) & (np.asarray(y_pred) > 0)
entropy_val = entropy(np.asarray(y_test)[_pos], np.asarray(y_pred)[_pos]) if _pos.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0008619570126196017
R2 Score: 0.9964724585076836
RMSE: 0.029359
Entropy Value: inf
In [497]:
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCA components -- the feature labels below are only
# positional and can mislead.
importance_values = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[497]:
feature importance
1 human_development_index 0.429013
0 hospital_beds_per_thousand 0.351790
5 population 0.185761
2 extreme_poverty 0.022991
3 gdp_per_capita 0.005310
4 population_density 0.005134
In [2]:
# Country Pair by Pair Analysis relative to aged_65_older
In [3]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hard-coded absolute Windows path -- breaks on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[3]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [4]:
# Showing the pairings of countries based on aged_65_older (13 pairs of countries)
def by_location(country):
    """Return the rows of `df` for a single country."""
    return df[df["location"] == country]

df_Bulgaria = by_location("Bulgaria")
df_Finland = by_location("Finland")

df_Italy = by_location("Italy")
df_Portugal = by_location("Portugal")

df_Sweden = by_location("Sweden")
df_Austria = by_location("Austria")

df_Belgium = by_location("Belgium")
df_Canada = by_location("Canada")

df_Czechia = by_location("Czechia")
df_Denmark = by_location("Denmark")

df_Estonia = by_location("Estonia")
df_France = by_location("France")

df_Latvia = by_location("Latvia")
df_Netherlands = by_location("Netherlands")

df_Romania = by_location("Romania")
df_Serbia = by_location("Serbia")

df_Slovenia = by_location("Slovenia")
df_Spain = by_location("Spain")

df_Switzerland = by_location("Switzerland")
df_UnitedKingdom = by_location("United Kingdom")

df_Cyprus = by_location("Cyprus")
df_Iceland = by_location("Iceland")

df_Ireland = by_location("Ireland")
df_Luxembourg = by_location("Luxembourg")

df_Slovakia = by_location("Slovakia")
df_UnitedStates = by_location("United States")
In [5]:
# tail(-2) keeps all but the first 2 rows -- presumably to align the UK's
# start date with the other countries; TODO confirm the rationale.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [6]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
# NOTE(review): despite the comment, the list below contains all 26 countries
# (both members of every pair), not just the first of each pair.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# Written to the current working directory; later cells re-read it from the
# Downloads folder, so the two paths must point at the same file.
dataframe_one.to_csv("dataframe-one.csv")
In [7]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path -- breaks on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[7]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [8]:
country1 = 'Bulgaria'
country2 = 'Finland'

# Keep the population-health-index features (plus identifiers and target) and
# only the rows belonging to the two countries under comparison.
selected_columns = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                    'female_smokers', 'male_smokers', 'life_expectancy', 'median_age',
                    'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), selected_columns]
In [9]:
df_updated
Out[9]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590
8372 Finland 12/26/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590
8373 Finland 12/27/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590
8374 Finland 12/28/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590
8375 Finland 12/29/2022 153.507 5.76 18.3 22.6 81.91 42.8 0.551590

2093 rows × 9 columns

In [10]:
'''
Create lagged mortality-rate variables (previous day / week / month) with
pandas shift() so the time series becomes a supervised-learning problem that
XGBoost can consume: each row is one observation, each column one feature.
'''
# df_updated is a filtered slice of a larger frame; take an explicit copy so
# the column assignments below cannot raise SettingWithCopyWarning or be
# silently dropped under pandas copy-on-write.
df_updated = df_updated.copy()

# Lagged variables for the previous day / week / month mortality rate,
# computed per country so one country's history never leaks into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [11]:
# Replace the NaN values that shift() left at the head of each country's
# series with 0, for all three lag columns at once.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [12]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lag columns -- target
# leakage. PCA is also fitted on ALL rows before the train/test split, and on
# unscaled data, so high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[12]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [13]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project every row onto the first 6 principal components (transform output
# columns are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [14]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a mixture of all PCA inputs), not that feature.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [15]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never selected into X below, so this
# encoding does not reach the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [16]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
# X holds principal-component scores (labelled with original feature names);
# y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows lets the model see
# future observations of the same country during training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [17]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[17]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [18]:
# Apply scaling on the training set
# (uses the StandardScaler fitted in the previous cell; transform only, so the
# scaling parameters come solely from the training split)
X_train_scaled = scaler.transform(X_train)
In [19]:
# Apply scaling on the test set
# Reuse the scaler fitted on the training set -- never refit on test data.
X_test_scaled = scaler.transform(X_test)
In [20]:
# Define the (untuned) XGBoost regressor to be tuned below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for GridSearchCV (same values, keyword form)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [21]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled KFold on time-series-derived rows mixes past and
# future observations across folds, so the CV score is likely optimistic --
# consider TimeSeriesSplit. n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9667879314777128
In [22]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so this extra fit() is redundant
# (but harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [23]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and an entropy-based divergence measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) normalises both inputs and computes the
# Kullback-Leibler divergence; it returns inf whenever y_test contains a zero
# paired with a non-zero prediction. Restrict the computation to strictly
# positive pairs so the value is always finite and informative.
_pos = (np.asarray(y_test) > 0) & (np.asarray(y_pred) > 0)
entropy_val = entropy(np.asarray(y_test)[_pos], np.asarray(y_pred)[_pos]) if _pos.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0047703788248759415
R2 Score: 0.9980732819098217
RMSE: 0.069068
Entropy Value: 0.0006378391950056142
In [24]:
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCA components -- the feature labels below are only
# positional and can mislead.
importance_values = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[24]:
feature importance
5 median_age 0.374058
0 cardiovasc_death_rate 0.360130
1 diabetes_prevalence 0.179878
3 male_smokers 0.033397
2 female_smokers 0.031643
4 life_expectancy 0.020894
In [25]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- breaks on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[25]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [26]:
country1 = 'Bulgaria'
country2 = 'Finland'

# Keep the country-health-index features (plus identifiers and target) and
# only the rows belonging to the two countries under comparison.
selected_columns = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                    'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                    'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), selected_columns]
In [27]:
df_updated
Out[27]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.50 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.50 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.50 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590
8372 Finland 12/26/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590
8373 Finland 12/27/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590
8374 Finland 12/28/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590
8375 Finland 12/29/2022 3.280 0.938 0.04 40585.721 18.136 5540745 0.551590

2093 rows × 9 columns

In [28]:
'''
Create lagged mortality-rate variables (previous day / week / month) with
pandas shift() so the time series becomes a supervised-learning problem that
XGBoost can consume: each row is one observation, each column one feature.
'''
# df_updated is a filtered slice of a larger frame; take an explicit copy so
# the column assignments below cannot raise SettingWithCopyWarning or be
# silently dropped under pandas copy-on-write.
df_updated = df_updated.copy()

# Lagged variables for the previous day / week / month mortality rate,
# computed per country so one country's history never leaks into another's.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [29]:
# Replace the NaN values that shift() left at the head of each country's
# series with 0, for all three lag columns at once.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [30]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lag columns -- target
# leakage. PCA is also fitted on ALL rows before the train/test split, and on
# unscaled data, so high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[30]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [31]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project every row onto the first 6 principal components (transform output
# columns are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [32]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a mixture of all PCA inputs), not that feature.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [33]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never selected into X below, so this
# encoding does not reach the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [34]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds principal-component scores (labelled with original feature names);
# y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows lets the model see
# future observations of the same country during training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [35]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/std from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[35]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [36]:
# Apply scaling on the training set
# (uses the StandardScaler fitted in the previous cell; transform only, so the
# scaling parameters come solely from the training split)
X_train_scaled = scaler.transform(X_train)
In [37]:
# Apply scaling on the test set
# Reuse the scaler fitted on the training set -- never refit on test data.
X_test_scaled = scaler.transform(X_test)
In [38]:
# Define the (untuned) XGBoost regressor to be tuned below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for GridSearchCV (same values, keyword form)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [39]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled KFold on time-series-derived rows mixes past and
# future observations across folds, so the CV score is likely optimistic --
# consider TimeSeriesSplit. n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.965382393805797
In [40]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so this extra fit() is redundant
# (but harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [41]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and an entropy-based divergence measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) normalises both inputs and computes the
# Kullback-Leibler divergence; it returns inf whenever y_test contains a zero
# paired with a non-zero prediction. Restrict the computation to strictly
# positive pairs so the value is always finite and informative.
_pos = (np.asarray(y_test) > 0) & (np.asarray(y_pred) > 0)
entropy_val = entropy(np.asarray(y_test)[_pos], np.asarray(y_pred)[_pos]) if _pos.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005470667380413319
R2 Score: 0.9977904409284591
RMSE: 0.073964
Entropy Value: 0.0007247508478906556
In [42]:
# NOTE(review): the model was trained on principal components, so these
# importances belong to PCA components -- the feature labels below are only
# positional and can mislead.
importance_values = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[42]:
feature importance
0 hospital_beds_per_thousand 0.657479
5 population 0.175003
1 human_development_index 0.122471
2 extreme_poverty 0.022564
3 gdp_per_capita 0.011838
4 population_density 0.010644
In [43]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path -- breaks on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[43]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [44]:
country1 = 'Italy'
country2 = 'Portugal'

# Keep only the population-health features plus the target for the two
# countries being compared. .copy() detaches the subset from the parent
# frame so the later lag-column assignments do not trigger
# SettingWithCopyWarning or silently write to a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [45]:
# Display the filtered two-country subset as a sanity check
df_updated
Out[45]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 46.2 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 47.9 0.735109

2098 rows × 9 columns

In [46]:
# Convert the time series to a supervised-learning table by adding lagged
# mortality features (previous day / week / month) with pandas shift().
# XGBoost consumes tabular rows, so each observation must carry its own
# history as explicit columns; shifting within each location keeps one
# country's history from leaking into another's.
# NOTE(review): these assignments add columns to df_updated — ensure the
# earlier filtering cell produced a .copy(), not a view of the full frame.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [47]:
# Lags are undefined at the start of each country's series; treat the
# missing history as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [48]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after location/date, which
# here includes 'Mortality Rate' (the target) and its three lag columns —
# the resulting components are contaminated by the target (data leakage).
# Fit PCA on predictor columns only.
# NOTE(review): PCA is scale-sensitive and these features span very
# different ranges; standardize BEFORE fitting PCA — scaling is currently
# applied only downstream, after the components are extracted.
# NOTE(review): PCA is also fit on all rows before the train/test split,
# leaking test-set information into the representation.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[48]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [49]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the transform input (iloc[:, 2:]) has 10 columns here
# (6 features + target + 3 lags), so keeping the first 6 PCs both embeds
# mortality information and discards the trailing components' variance.
n_components = 6  # number of principal components retained
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [50]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but
# each column is a principal component (a linear mix of ALL inputs), not
# the named feature — the downstream "feature importance" tables inherit
# this mislabeling and should not be read as per-feature importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [51]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — the model's
# X is taken from principal_df — so this encoding is dead work and could
# be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [52]:
# NOTE(review): 'selected_cols' name PCA component columns that were
# mislabeled with the original feature names (see the principal_df cell).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): PCA was fit on all rows before this split, so test-set
# information already leaks into X; a random split also mixes future and
# past observations of a time series — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [53]:
# Fit scaling on the training set only (correct: avoids scaling leakage at
# this stage, although PCA upstream was already fit on all rows).
scaler = StandardScaler()
scaler.fit(X_train)
Out[53]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [54]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [55]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [56]:
# Define XGBoost model.
# NOTE(review): subsample / colsample_bytree < 1 make training stochastic;
# fixing the seed lets Restart-&-Run-All reproduce the reported scores.
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 candidate configurations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [57]:
# Perform grid search and 10-fold cross-validation (k = 10).
# Default scoring for a regressor is R^2; n_jobs=-1 parallelizes the folds.
# NOTE(review): 324 candidates * 10 folds = 3,240 fits — consider
# RandomizedSearchCV or a smaller grid for faster iteration.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9993592153995753
In [58]:
# GridSearchCV with refit=True (the default) has already refit the best
# configuration on the full training set, so best_estimator_ is ready to
# use — a second fit() call is redundant and was removed.
best_model = grid_search.best_estimator_

# Predict on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [59]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalizes them and computes KL divergence); applying
# it to raw mortality values and predictions is not a standard regression
# metric, and zero entries in either array can distort the result —
# confirm this is the intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.020270373691906065
R2 Score: 0.9982206997530265
RMSE: 0.142374
Entropy Value: 0.0009509974446059138
In [60]:
# NOTE(review): the model was trained on principal components, so these
# "importances" belong to PCs, not to the original features whose names
# were reused as column labels in principal_df — interpret with caution.
importance_values = best_model.feature_importances_
importance_df = pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
importance_df = importance_df.sort_values('importance', ascending=False)
importance_df
Out[60]:
feature importance
1 diabetes_prevalence 0.614331
0 cardiovasc_death_rate 0.304136
2 female_smokers 0.042708
5 median_age 0.038273
3 male_smokers 0.000499
4 life_expectancy 0.000053
In [61]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[61]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [62]:
country1 = 'Italy'
country2 = 'Portugal'

# Keep only the country-health features plus the target for the two
# countries being compared. .copy() detaches the subset from the parent
# frame so the later lag-column assignments do not trigger
# SettingWithCopyWarning or silently write to a view.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [63]:
df_updated
Out[63]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2098 rows × 9 columns

In [64]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [65]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [66]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after location/date, which
# here includes 'Mortality Rate' (the target) and its three lag columns —
# the resulting components are contaminated by the target (data leakage).
# Fit PCA on predictor columns only.
# NOTE(review): PCA is scale-sensitive and these features span wildly
# different ranges (population vs. HDI); standardize BEFORE fitting PCA —
# scaling is currently applied only downstream.
# NOTE(review): PCA is also fit on all rows before the train/test split,
# leaking test-set information into the representation.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[66]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [67]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [68]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [69]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [70]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [71]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[71]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [72]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [73]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [74]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [75]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992988336861244
In [76]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [77]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013594354001948517
R2 Score: 0.9988067098416261
RMSE: 0.116595
Entropy Value: 0.0007082932658725094
In [78]:
# NOTE(review): the model was trained on principal components, so these
# "importances" belong to PCs, not to the original features whose names
# were reused as column labels in principal_df — interpret with caution.
importance_values = best_model.feature_importances_
importance_df = pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
importance_df = importance_df.sort_values('importance', ascending=False)
importance_df
Out[78]:
feature importance
5 population 0.526697
1 human_development_index 0.421271
2 extreme_poverty 0.034415
0 hospital_beds_per_thousand 0.010278
3 gdp_per_capita 0.007127
4 population_density 0.000212
In [79]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[79]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [80]:
country1 = 'Sweden'
country2 = 'Austria'

# Keep only the population-health features plus the target for the two
# countries being compared. .copy() detaches the subset from the parent
# frame so the later lag-column assignments do not trigger
# SettingWithCopyWarning or silently write to a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [81]:
df_updated
Out[81]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 41.0 0.816005

2102 rows × 9 columns

In [82]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [83]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [84]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after location/date, which
# here includes 'Mortality Rate' (the target) and its three lag columns —
# the resulting components are contaminated by the target (data leakage).
# Fit PCA on predictor columns only.
# NOTE(review): PCA is scale-sensitive and these features span very
# different ranges; standardize BEFORE fitting PCA — scaling is currently
# applied only downstream.
# NOTE(review): PCA is also fit on all rows before the train/test split,
# leaking test-set information into the representation.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[84]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [85]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [86]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [87]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [88]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [89]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[89]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [90]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [91]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [92]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [93]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9986828973635486
In [94]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [95]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01245250411538312
R2 Score: 0.9973372856257704
RMSE: 0.111591
Entropy Value: 0.0006474195228361843
In [96]:
# NOTE(review): the model was trained on principal components, so these
# "importances" belong to PCs, not to the original features whose names
# were reused as column labels in principal_df — interpret with caution.
importance_values = best_model.feature_importances_
importance_df = pd.DataFrame({'feature': selected_cols, 'importance': importance_values})
importance_df = importance_df.sort_values('importance', ascending=False)
importance_df
Out[96]:
feature importance
0 cardiovasc_death_rate 0.397694
5 median_age 0.349627
1 diabetes_prevalence 0.226924
2 female_smokers 0.017465
4 life_expectancy 0.004970
3 male_smokers 0.003319
In [97]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[97]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [98]:
country1 = 'Sweden'
country2 = 'Austria'

# Keep only the country-health features plus the target for the two
# countries being compared. .copy() detaches the subset from the parent
# frame so the later lag-column assignments do not trigger
# SettingWithCopyWarning or silently write to a view.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [99]:
df_updated
Out[99]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.816005

2102 rows × 9 columns

In [100]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [101]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [102]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after location/date, which
# here includes 'Mortality Rate' (the target) and its three lag columns —
# the resulting components are contaminated by the target (data leakage).
# Fit PCA on predictor columns only.
# NOTE(review): PCA is scale-sensitive and these features span very
# different ranges; standardize BEFORE fitting PCA — scaling is currently
# applied only downstream.
# NOTE(review): PCA is also fit on all rows before the train/test split,
# leaking test-set information into the representation.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[102]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [103]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [104]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [105]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [106]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [107]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[107]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [108]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [109]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [110]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [111]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3,240 fits. GridSearchCV defaults to refit=True,
# so best_estimator_ is automatically refitted on the whole training split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9984810446105193
In [112]:
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has ALREADY been refitted on the full training split; the
# previous explicit best_model.fit(X_train_scaled, y_train) call was redundant
# (it retrained on the same data with the same parameters) and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [113]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalises both vectors into
# probability distributions and returns their KL divergence — it is not a standard
# regression metric and is inf if any y_pred element is 0 where y_test > 0; verify intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.014427701442901309
R2 Score: 0.9969149299078208
RMSE: 0.120115
Entropy Value: 0.000925590220887666
In [114]:
# Rank the model inputs by the tuned booster's importance attribution
# (built and sorted in a single method chain; result displayed below).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[114]:
feature importance
1 human_development_index 0.475411
0 hospital_beds_per_thousand 0.224173
2 extreme_poverty 0.198854
5 population 0.080118
3 gdp_per_capita 0.020331
4 population_density 0.001113
In [115]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[115]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [116]:
# Country pair analysed in this section.
country1 = 'Belgium'
country2 = 'Canada'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [117]:
# Preview the filtered two-country frame
df_updated
Out[117]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 41.8 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 41.4 1.093162

2132 rows × 9 columns

In [118]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross
# the country boundary; the first 1/7/30 rows per country become NaN.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [119]:
# The shift() lags leave NaNs at the start of each country's series; replace
# them with 0 in every lag column so the model can consume all rows.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [120]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the prediction target leaks into the components used as
# model inputs — this likely inflates the reported R^2; verify intent.
# Also: PCA is fitted before the train/test split and on unstandardised columns,
# so the largest-variance raw features dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[120]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [121]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [122]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA scores, not the original variables —
# reusing the raw feature names is misleading; consider PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [123]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only 'Mortality Rate' is read from df_updated below — the dummy
# columns appear unused; confirm and remove if so.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [124]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random (shuffled) split of daily time-series rows; a
# chronological split avoids look-ahead bias for forecasting-style claims.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [125]:
# Fit the standardizer on the training split only; the learned mean/std are
# reused on the test split later, so no test information leaks into scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[125]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [126]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [127]:
# Apply scaling on the test set
# (reuses the statistics fitted on X_train — the scaler is not refitted here)
X_test_scaled = scaler.transform(X_test)
In [128]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations will be evaluated by the grid search below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [129]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3,240 fits. GridSearchCV defaults to refit=True,
# so best_estimator_ is automatically refitted on the whole training split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9990497455346172
In [130]:
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has ALREADY been refitted on the full training split; the
# previous explicit best_model.fit(X_train_scaled, y_train) call was redundant
# (it retrained on the same data with the same parameters) and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [131]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalises both vectors into
# probability distributions and returns their KL divergence — it is not a standard
# regression metric and is inf if any y_pred element is 0 where y_test > 0; verify intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.016727148283166463
R2 Score: 0.9987351396841938
RMSE: 0.129333
Entropy Value: 0.0004295927143185571
In [132]:
# Rank the model inputs by the tuned booster's importance attribution
# (built and sorted in a single method chain; result displayed below).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[132]:
feature importance
1 diabetes_prevalence 0.908673
0 cardiovasc_death_rate 0.053697
5 median_age 0.021604
2 female_smokers 0.012787
3 male_smokers 0.003085
4 life_expectancy 0.000153
In [133]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[133]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [134]:
# Country pair analysed in this section.
country1 = 'Belgium'
country2 = 'Canada'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [135]:
# Preview the filtered two-country frame
df_updated
Out[135]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.093162

2132 rows × 9 columns

In [136]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross
# the country boundary; the first 1/7/30 rows per country become NaN.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [137]:
# The shift() lags leave NaNs at the start of each country's series; replace
# them with 0 in every lag column so the model can consume all rows.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [138]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the prediction target leaks into the components used as
# model inputs — this likely inflates the reported R^2; verify intent.
# Also: PCA is fitted before the train/test split and on unstandardised columns,
# so large-scale features (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[138]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [139]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [140]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA scores, not the original variables —
# reusing the raw feature names is misleading; consider PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [141]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only 'Mortality Rate' is read from df_updated below — the dummy
# columns appear unused; confirm and remove if so.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [142]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random (shuffled) split of daily time-series rows; a
# chronological split avoids look-ahead bias for forecasting-style claims.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [143]:
# Fit the standardizer on the training split only; the learned mean/std are
# reused on the test split later, so no test information leaks into scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[143]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [144]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [145]:
# Apply scaling on the test set
# (reuses the statistics fitted on X_train — the scaler is not refitted here)
X_test_scaled = scaler.transform(X_test)
In [146]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations will be evaluated by the grid search below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [147]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3,240 fits. GridSearchCV defaults to refit=True,
# so best_estimator_ is automatically refitted on the whole training split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982635754420282
In [148]:
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has ALREADY been refitted on the full training split; the
# previous explicit best_model.fit(X_train_scaled, y_train) call was redundant
# (it retrained on the same data with the same parameters) and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [149]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalises both vectors into
# probability distributions and returns their KL divergence — it is not a standard
# regression metric and is inf if any y_pred element is 0 where y_test > 0; verify intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.021593332086563184
R2 Score: 0.9983671724325057
RMSE: 0.146947
Entropy Value: 0.0011894401573703628
In [150]:
# Rank the model inputs by the tuned booster's importance attribution
# (built and sorted in a single method chain; result displayed below).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[150]:
feature importance
1 human_development_index 0.854404
0 hospital_beds_per_thousand 0.055223
2 extreme_poverty 0.054972
5 population 0.017190
3 gdp_per_capita 0.014188
4 population_density 0.004023
In [151]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[151]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [152]:
# Country pair analysed in this section.
country1 = 'Czechia'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [153]:
# Preview the filtered two-country frame
df_updated
Out[153]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 43.3 0.000000
... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.227772
6245 Denmark 12/26/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.227772
6246 Denmark 12/27/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.228905
6247 Denmark 12/28/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.229131
6248 Denmark 12/29/2022 114.767 6.41 19.3 18.8 80.90 42.3 0.229131

2096 rows × 9 columns

In [154]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross
# the country boundary; the first 1/7/30 rows per country become NaN.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [155]:
# The shift() lags leave NaNs at the start of each country's series; replace
# them with 0 in every lag column so the model can consume all rows.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [156]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the three lagged
# mortality columns, so the prediction target leaks into the components used as
# model inputs — this likely inflates the reported R^2; verify intent.
# Also: PCA is fitted before the train/test split and on unstandardised columns,
# so the largest-variance raw features dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[156]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [157]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [158]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold PCA scores, not the original variables —
# reusing the raw feature names is misleading; consider PC1..PC6.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [159]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only 'Mortality Rate' is read from df_updated below — the dummy
# columns appear unused; confirm and remove if so.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [160]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random (shuffled) split of daily time-series rows; a
# chronological split avoids look-ahead bias for forecasting-style claims.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [161]:
# Fit the standardizer on the training split only; the learned mean/std are
# reused on the test split later, so no test information leaks into scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[161]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [162]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [163]:
# Apply scaling on the test set
# (reuses the statistics fitted on X_train — the scaler is not refitted here)
X_test_scaled = scaler.transform(X_test)
In [164]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations will be evaluated by the grid search below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [165]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3,240 fits. GridSearchCV defaults to refit=True,
# so best_estimator_ is automatically refitted on the whole training split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990998156820339
In [166]:
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has ALREADY been refitted on the full training split; the
# previous explicit best_model.fit(X_train_scaled, y_train) call was redundant
# (it retrained on the same data with the same parameters) and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [167]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalises both vectors into
# probability distributions and returns their KL divergence — it is not a standard
# regression metric and is inf if any y_pred element is 0 where y_test > 0; verify intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the mortality-rate target
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0022554314390671685
R2 Score: 0.9981274828812655
RMSE: 0.047491
Entropy Value: 0.00039819174563969235
In [168]:
# Rank the model inputs by the tuned booster's importance attribution
# (built and sorted in a single method chain; result displayed below).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[168]:
feature importance
1 diabetes_prevalence 0.847823
5 median_age 0.105723
0 cardiovasc_death_rate 0.030798
2 female_smokers 0.014866
3 male_smokers 0.000634
4 life_expectancy 0.000156
In [169]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[169]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [170]:
# Country pair analysed in this section.
country1 = 'Czechia'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [171]:
# Preview the filtered two-country frame
df_updated
Out[171]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.227772
6245 Denmark 12/26/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.227772
6246 Denmark 12/27/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.228905
6247 Denmark 12/28/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.229131
6248 Denmark 12/29/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.229131

2096 rows × 9 columns

In [172]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross
# the country boundary; the first 1/7/30 rows per country become NaN.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [173]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [174]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the
# three lagged-mortality columns, so the prediction target leaks into the
# components that are later used as model inputs — this likely explains the
# near-perfect R^2 downstream. Fit PCA on the predictor columns only, and
# standardize them first (PCA is scale-sensitive; unscaled 'population'
# dominates the variance).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[174]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [175]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# (equivalently, PCA(n_components=6) could have been fitted directly)
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [176]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear combination of all ten input columns,
# including 'Mortality Rate' and its lags), not the named original feature.
# Names like 'PC1'..'PC6' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [177]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below is built from
# principal_df — so this step only widens df_updated without effect.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [178]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series places future
# days in the training set; a chronological split (or TimeSeriesSplit) would
# give an honest out-of-time evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [179]:
# Learn standardization parameters (mean/std) from the training split only;
# fit() returns the scaler itself, which the cell displays as before.
scaler = StandardScaler().fit(X_train)
scaler
Out[179]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [180]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [181]:
# Apply scaling on the test set (using statistics learned from the training
# split only — correctly avoids test-set leakage at this stage)
X_test_scaled = scaler.transform(X_test)
In [182]:
# Base XGBoost regressor; all tuning is delegated to the grid search below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, shrinkage, ensemble size,
# minimum split loss (gamma), and row/column subsampling ratios.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [183]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3,240 model fits;
# n_jobs=-1 parallelizes across all cores. Default scoring for a regressor
# is R^2.
# NOTE(review): the CV folds mix time periods because the data were shuffled
# at split time — consider TimeSeriesSplit for temporal validation.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990802017690633
In [184]:
# GridSearchCV (refit=True, the default) already refits the best parameter
# combination on the whole training set, so best_estimator_ is a trained
# model — calling fit() on it again only repeats the same work.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [185]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (each argument is normalized to sum to 1).
# y_test/y_pred are mortality rates, not distributions, and y_test contains
# zeros, so this value is not a meaningful regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0018119703339651765
R2 Score: 0.9984956556824479
RMSE: 0.042567
Entropy Value: 0.0003609256883659556
In [186]:
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from PCA components that were merely relabeled
# with original feature names, so these "importances" belong to the principal
# components, not the raw indicators — the per-feature reading is misleading.
# Fit on the original (scaled) features to interpret importances this way.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[186]:
feature importance
1 human_development_index 0.672643
0 hospital_beds_per_thousand 0.166905
5 population 0.134752
2 extreme_poverty 0.022526
3 gdp_per_capita 0.002956
4 population_density 0.000219
In [187]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — breaks on any other machine;
# prefer a configurable Path (e.g. DATA_DIR / "dataframe-one.csv").
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[187]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [188]:
country1 = 'Estonia'
country2 = 'France'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [189]:
df_updated
Out[189]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 42.7 0.000000
... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 42.0 0.411892

2132 rows × 9 columns

In [190]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [191]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [192]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the
# three lagged-mortality columns, so the prediction target leaks into the
# components that are later used as model inputs — this likely explains the
# near-perfect R^2 downstream. Fit PCA on the predictor columns only, and
# standardize them first (PCA is scale-sensitive).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[192]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [193]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# (equivalently, PCA(n_components=6) could have been fitted directly)
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [194]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear combination of all ten input columns,
# including 'Mortality Rate' and its lags), not the named original feature.
# Names like 'PC1'..'PC6' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [195]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below is built from
# principal_df — so this step only widens df_updated without effect.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [196]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series places future
# days in the training set; a chronological split (or TimeSeriesSplit) would
# give an honest out-of-time evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [197]:
# Learn standardization parameters (mean/std) from the training split only;
# fit() returns the scaler itself, which the cell displays as before.
scaler = StandardScaler().fit(X_train)
scaler
Out[197]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [198]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [199]:
# Apply scaling on the test set (using statistics learned from the training
# split only — correctly avoids test-set leakage at this stage)
X_test_scaled = scaler.transform(X_test)
In [200]:
# Base XGBoost regressor; all tuning is delegated to the grid search below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, shrinkage, ensemble size,
# minimum split loss (gamma), and row/column subsampling ratios.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [201]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3,240 model fits;
# n_jobs=-1 parallelizes across all cores. Default scoring for a regressor
# is R^2.
# NOTE(review): the CV folds mix time periods because the data were shuffled
# at split time — consider TimeSeriesSplit for temporal validation.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9976163185744944
In [202]:
# GridSearchCV (refit=True, the default) already refits the best parameter
# combination on the whole training set, so best_estimator_ is a trained
# model — calling fit() on it again only repeats the same work.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [203]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (each argument is normalized to sum to 1).
# y_test/y_pred are mortality rates, not distributions, and y_test contains
# zeros, so this value is not a meaningful regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0627316342415563
R2 Score: 0.9934687538779688
RMSE: 0.250463
Entropy Value: 0.0031520180501137213
In [204]:
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from PCA components that were merely relabeled
# with original feature names, so these "importances" belong to the principal
# components, not the raw indicators — the per-feature reading is misleading.
# Fit on the original (scaled) features to interpret importances this way.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[204]:
feature importance
1 diabetes_prevalence 0.613007
0 cardiovasc_death_rate 0.318802
5 median_age 0.047053
2 female_smokers 0.012094
3 male_smokers 0.008496
4 life_expectancy 0.000548
In [205]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — breaks on any other machine;
# prefer a configurable Path (e.g. DATA_DIR / "dataframe-one.csv").
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[205]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [206]:
country1 = 'Estonia'
country2 = 'France'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [207]:
df_updated
Out[207]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6250 Estonia 1/18/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6251 Estonia 2/5/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6252 Estonia 2/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
6253 Estonia 2/7/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.000000
... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.901 0.02 38605.671 122.578 67813000 0.411892

2132 rows × 9 columns

In [208]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [209]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [210]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the
# three lagged-mortality columns, so the prediction target leaks into the
# components that are later used as model inputs — this likely explains the
# near-perfect R^2 downstream. Fit PCA on the predictor columns only, and
# standardize them first (PCA is scale-sensitive).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[210]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [211]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# (equivalently, PCA(n_components=6) could have been fitted directly)
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [212]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear combination of all ten input columns,
# including 'Mortality Rate' and its lags), not the named original feature.
# Names like 'PC1'..'PC6' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [213]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below is built from
# principal_df — so this step only widens df_updated without effect.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [214]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series places future
# days in the training set; a chronological split (or TimeSeriesSplit) would
# give an honest out-of-time evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [215]:
# Learn standardization parameters (mean/std) from the training split only;
# fit() returns the scaler itself, which the cell displays as before.
scaler = StandardScaler().fit(X_train)
scaler
Out[215]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [216]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [217]:
# Apply scaling on the test set (using statistics learned from the training
# split only — correctly avoids test-set leakage at this stage)
X_test_scaled = scaler.transform(X_test)
In [218]:
# Base XGBoost regressor; all tuning is delegated to the grid search below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, shrinkage, ensemble size,
# minimum split loss (gamma), and row/column subsampling ratios.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [219]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3,240 model fits;
# n_jobs=-1 parallelizes across all cores. Default scoring for a regressor
# is R^2.
# NOTE(review): the CV folds mix time periods because the data were shuffled
# at split time — consider TimeSeriesSplit for temporal validation.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9978195582971618
In [220]:
# GridSearchCV (refit=True, the default) already refits the best parameter
# combination on the whole training set, so best_estimator_ is a trained
# model — calling fit() on it again only repeats the same work.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [221]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (each argument is normalized to sum to 1).
# y_test/y_pred are mortality rates, not distributions, and y_test contains
# zeros, so this value is not a meaningful regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.07504882607957856
R2 Score: 0.9921863608333906
RMSE: 0.273950
Entropy Value: 0.004313799679754843
In [222]:
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from PCA components that were merely relabeled
# with original feature names, so these "importances" belong to the principal
# components, not the raw indicators — the per-feature reading is misleading.
# Fit on the original (scaled) features to interpret importances this way.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[222]:
feature importance
1 human_development_index 0.760584
5 population 0.117334
0 hospital_beds_per_thousand 0.096855
2 extreme_poverty 0.013718
3 gdp_per_capita 0.011203
4 population_density 0.000306
In [223]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — breaks on any other machine;
# prefer a configurable Path (e.g. DATA_DIR / "dataframe-one.csv").
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[223]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [224]:
country1 = 'Latvia'
country2 = 'Netherlands'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [225]:
df_updated
Out[225]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 43.9 0.631969

2075 rows × 9 columns

In [226]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [227]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [229]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' and the
# three lagged-mortality columns, so the prediction target leaks into the
# components that are later used as model inputs — this likely explains the
# near-perfect R^2 downstream. Fit PCA on the predictor columns only, and
# standardize them first (PCA is scale-sensitive).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[229]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [230]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# (equivalently, PCA(n_components=6) could have been fitted directly)
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [231]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a
# principal component (a linear combination of all ten input columns,
# including 'Mortality Rate' and its lags), not the named original feature.
# Names like 'PC1'..'PC6' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [232]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below is built from
# principal_df — so this step only widens df_updated without effect.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [233]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series places future
# days in the training set; a chronological split (or TimeSeriesSplit) would
# give an honest out-of-time evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [234]:
# Learn standardization parameters (mean/std) from the training split only;
# fit() returns the scaler itself, which the cell displays as before.
scaler = StandardScaler().fit(X_train)
scaler
Out[234]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [235]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [236]:
# Apply scaling on the test set (using statistics learned from the training
# split only — correctly avoids test-set leakage at this stage)
X_test_scaled = scaler.transform(X_test)
In [237]:
# Base XGBoost regressor; all tuning is delegated to the grid search below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: tree depth, shrinkage, ensemble size,
# minimum split loss (gamma), and row/column subsampling ratios.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [238]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3,240 model fits;
# n_jobs=-1 parallelizes across all cores. Default scoring for a regressor
# is R^2.
# NOTE(review): the CV folds mix time periods because the data were shuffled
# at split time — consider TimeSeriesSplit for temporal validation.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990018484824634
In [239]:
# GridSearchCV (refit=True, the default) already refits the best parameter
# combination on the whole training set, so best_estimator_ is a trained
# model — calling fit() on it again only repeats the same work.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [240]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# two probability distributions (each argument is normalized to sum to 1).
# y_test/y_pred are mortality rates, not distributions, and y_test contains
# zeros, so this value is not a meaningful regression metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005596727727292748
R2 Score: 0.99924811770952
RMSE: 0.074811
Entropy Value: 0.0003576869080799259
In [241]:
feature_importances = best_model.feature_importances_
# NOTE(review): X was built from PCA components that were merely relabeled
# with original feature names, so these "importances" belong to the principal
# components, not the raw indicators — the per-feature reading is misleading.
# Fit on the original (scaled) features to interpret importances this way.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[241]:
feature importance
1 diabetes_prevalence 0.832180
0 cardiovasc_death_rate 0.135251
2 female_smokers 0.017563
5 median_age 0.014338
3 male_smokers 0.000591
4 life_expectancy 0.000076
In [242]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — breaks on any other machine;
# prefer a configurable Path (e.g. DATA_DIR / "dataframe-one.csv").
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[242]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [243]:
# Pair of countries compared in this run of the pipeline.
country1 = 'Latvia'
country2 = 'Netherlands'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# Keep only rows belonging to the two selected countries.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [244]:
# Preview the two-country subset (rich display of the filtered frame).
df_updated
Out[244]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631969

2075 rows × 9 columns

In [245]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Lags are computed within each country so values never cross a country boundary.
# NOTE(review): shift() assumes rows are already sorted by date within each
# location — confirm the CSV preserves that ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [246]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the first 1/7/30 rows per country fabricates a
# zero mortality history at the start of each series — consider dropping them.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [247]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point iloc[:, 2:] contains not only the six intended
# predictors but also 'Mortality Rate' (the target) and its three lag columns,
# so PCA is fit on the target itself. It is also fit on ALL rows before the
# train/test split (information leakage), and on unstandardized data, so the
# huge-magnitude 'population' column will dominate the leading components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[247]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [248]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of principal components kept for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [249]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are the first six PRINCIPAL COMPONENTS, not the
# original variables — reusing the raw feature names here is misleading, and
# any downstream "feature importance" table inherits the wrong labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [250]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used by the model — X below is
# built from principal_df — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [251]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds principal-component scores (labeled with raw feature names); y is the
# raw mortality rate from the same rows.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes past and future
# observations; a chronological split would be more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [252]:
# Fit scaling on the training set
# Scaler statistics come from the training portion only (no test leakage here).
scaler = StandardScaler()
scaler.fit(X_train)
Out[252]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [253]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [254]:
# Apply scaling on the test set
# Uses the train-fit scaler, as it should.
X_test_scaled = scaler.transform(X_test)
In [255]:
# Base XGBoost regressor; all tunable hyperparameters are supplied by the
# grid search below.
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter values to tune (3*3*3*3*2*2 = 324 combinations).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [256]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 fits, parallelized over all
# cores (n_jobs=-1); scoring defaults to the regressor's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988456921162413
In [257]:
# Use the tuned model selected by the grid search.
# With the default refit=True, GridSearchCV has already refit best_estimator_
# on the full training set, so calling .fit() again here only repeated the
# identical training work — the redundant refit is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [258]:
# Evaluate the performance of the XGBoost model: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 Score, and "Entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is a KL divergence between probability
# distributions, not a regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005570458953609194
R2 Score: 0.9992516467405338
RMSE: 0.074636
Entropy Value: 0.0006602651523713767
In [259]:
# NOTE(review): these "features" are principal components carrying raw feature
# names, so the importance values describe PCs, not the named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[259]:
feature importance
1 human_development_index 0.646902
0 hospital_beds_per_thousand 0.161196
2 extreme_poverty 0.124314
5 population 0.045094
3 gdp_per_capita 0.019807
4 population_density 0.002686
In [260]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[260]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [261]:
# Country pair for this (copy-pasted) run of the pipeline.
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [262]:
# Preview the two-country subset.
df_updated
Out[262]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 40.2 76.00 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 43.0 2.036403

2076 rows × 9 columns

In [263]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Lags are computed per country; assumes rows are date-sorted within location.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [264]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates a zero mortality history at the start
# of each country's series.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [265]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still includes the target 'Mortality Rate' and its
# lag columns, and PCA is fit on all rows before the train/test split — target
# and train/test leakage; input is also unstandardized.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[265]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [266]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of principal components kept for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [267]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the raw variables the
# names suggest; downstream importances inherit these misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [268]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used by the model below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [269]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
# X = principal-component scores; y = raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a time series — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [270]:
# Fit scaling on the training set
# Scaler statistics come from the training portion only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[270]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [271]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [272]:
# Apply scaling on the test set using the train-fit scaler
X_test_scaled = scaler.transform(X_test)
In [273]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [274]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 combos x 10 folds = 3240 fits, parallelized with n_jobs=-1.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983847769546339
In [275]:
# Fit the model using the best hyperparameters
# NOTE(review): with refit=True (the default), best_estimator_ is already fit
# on the full training set; this extra fit() repeats identical work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [276]:
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2, and "Entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is a KL divergence between distributions,
# not a regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0017588024130940267
R2 Score: 0.9989760104365151
RMSE: 0.041938
Entropy Value: 0.00044469540288006774
In [277]:
# NOTE(review): importances apply to principal components carrying raw feature
# names, not to the named variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[277]:
feature importance
5 median_age 0.625635
0 cardiovasc_death_rate 0.244771
1 diabetes_prevalence 0.109839
2 female_smokers 0.010393
3 male_smokers 0.008904
4 life_expectancy 0.000458
In [278]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[278]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [279]:
# Country pair for this (copy-pasted) run of the pipeline.
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [280]:
# Preview the two-country subset.
df_updated
Out[280]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 9 columns

In [281]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Lags are computed per country; assumes rows are date-sorted within location.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [282]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates a zero mortality history at the start
# of each country's series.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [283]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still includes the target 'Mortality Rate' and its
# lag columns, and PCA is fit on all rows before the split — leakage; input is
# also unstandardized.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[283]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [284]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of principal components kept for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [285]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the raw variables the
# names suggest; downstream importances inherit these misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [286]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used by the model below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [287]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X = principal-component scores; y = raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a time series — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [288]:
# Fit scaling on the training set
# Scaler statistics come from the training portion only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[288]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [289]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [290]:
# Apply scaling on the test set using the train-fit scaler
X_test_scaled = scaler.transform(X_test)
In [291]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [292]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 combos x 10 folds = 3240 fits, parallelized with n_jobs=-1.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979474526774726
In [293]:
# Fit the model using the best hyperparameters
# NOTE(review): with refit=True (the default), best_estimator_ is already fit
# on the full training set; this extra fit() repeats identical work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [294]:
# Evaluate the performance of the XGBoost model: MSE, RMSE, R^2, and "Entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is a KL divergence between distributions,
# not a regression metric — confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002076185405706244
R2 Score: 0.9987912273877525
RMSE: 0.045565
Entropy Value: 0.00045853981129780964
In [295]:
# NOTE(review): importances apply to principal components carrying raw feature
# names, not to the named variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[295]:
feature importance
5 population 0.663901
0 hospital_beds_per_thousand 0.186232
1 human_development_index 0.124847
2 extreme_poverty 0.013139
3 gdp_per_capita 0.011260
4 population_density 0.000621
In [296]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[296]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [297]:
# Country pair for this (copy-pasted) run of the pipeline.
country1 = 'Slovenia'
country2 = 'Spain'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [298]:
# Preview the two-country subset.
df_updated
Out[298]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
24074 Spain 2/1/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
24075 Spain 2/2/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
24076 Spain 2/3/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
24077 Spain 2/4/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
24078 Spain 2/5/2020 99.403 7.17 27.4 31.4 83.56 45.5 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 44.5 0.536669

2125 rows × 9 columns

In [299]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Lags are computed per country; assumes rows are date-sorted within location.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [300]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling fabricates a zero mortality history at the start
# of each country's series.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [301]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] still includes the target 'Mortality Rate' and its
# lag columns, and PCA is fit on all rows before the split — leakage; input is
# also unstandardized.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[301]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [302]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of principal components kept for the XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [303]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components, not the raw variables the
# names suggest; downstream importances inherit these misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [304]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used by the model below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [305]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
# X = principal-component scores; y = raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a time series — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [306]:
# Fit scaling on the training set
# Scaler statistics come from the training portion only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[306]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [307]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [308]:
# Apply scaling on the test set using the train-fit scaler
X_test_scaled = scaler.transform(X_test)
In [309]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune: 3*3*3*3*2*2 = 324 candidates, i.e. 3240
# fits with the 10-fold CV used below — consider RandomizedSearchCV if
# runtime becomes an issue.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [310]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain k-fold shuffles time-ordered rows across folds; for a
# time-series target, TimeSeriesSplit avoids validating on the past.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987609289639716
In [311]:
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the full training set with the best hyperparameters — refitting it
# a second time here was redundant work and is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [312]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it returns inf if any y_pred is 0 where
# y_test > 0. Verify this is the intended quantity before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.007138550334511055
R2 Score: 0.9989051784209579
RMSE: 0.084490
Entropy Value: 0.0006678903177231346
In [313]:
# Rank the model's inputs by XGBoost's importance scores.
# NOTE(review): the inputs are principal components carrying original feature
# names (see the PCA cell), so this ranking orders PCs — it should not be read
# as importance of the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[313]:
feature importance
1 diabetes_prevalence 0.819366
0 cardiovasc_death_rate 0.131678
2 female_smokers 0.037593
5 median_age 0.009186
3 male_smokers 0.001930
4 life_expectancy 0.000246
In [314]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not reproducible on other
# machines; prefer a configurable data directory / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[314]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [315]:
country1 = 'Slovenia'
country2 = 'Spain'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the filtered result an independent frame: later cells assign
# lag columns with df_updated[...] = ..., which on a filtered slice can raise
# SettingWithCopyWarning or silently fail to write.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [316]:
# Sanity check: the filtered frame should contain only the two selected countries.
df_updated
Out[316]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
24074 Spain 2/1/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
24075 Spain 2/2/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
24076 Spain 2/3/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
24077 Spain 2/4/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
24078 Spain 2/5/2020 2.97 0.904 1.0 34272.36 93.105 47558632 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.84 102.619 2119843 0.536669

2125 rows × 9 columns

In [317]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') confines each shift to one country, so values never leak
# across the country boundary. NOTE(review): assumes rows are date-sorted
# within each country — confirm upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [318]:
# The lag columns are NaN for each country's first 1/7/30 rows (no prior
# observation); treat the missing history as zero mortality.
for lag_col in ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [319]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split (test-set
# leakage) and on unscaled features, so large-magnitude columns such as
# population dominate the components. Consider StandardScaler + PCA fit on
# the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[319]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [320]:
# Keep only the first six principal components — chosen to match the six
# input variables used in the XGBoost country-health analysis.
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [321]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components, NOT the original
# features — reusing the raw feature names is misleading and propagates into
# the feature-importance table below. Prefer names like 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [322]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df); this step only removes 'location' from df_updated — confirm
# it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [323]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; with near-duplicate
# consecutive daily rows and lagged targets upstream this puts neighbours in
# both splits and inflates the test R². A chronological split or
# TimeSeriesSplit would give an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [324]:
# Fit scaling on the training set only, so test-set statistics do not leak
# through the scaler. The bare fit() on the last line renders the fitted
# estimator as the cell output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[324]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [325]:
# Apply the training-set scaling to the training data
X_train_scaled = scaler.transform(X_train)
In [326]:
# Apply scaling on the test set (using training-set statistics only — no leakage here)
X_test_scaled = scaler.transform(X_test)
In [327]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune: 3*3*3*3*2*2 = 324 candidates, i.e. 3240
# fits with the 10-fold CV used below — consider RandomizedSearchCV if
# runtime becomes an issue.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [328]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain k-fold shuffles time-ordered rows across folds; for a
# time-series target, TimeSeriesSplit avoids validating on the past.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9976196824477601
In [329]:
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the full training set with the best hyperparameters — refitting it
# a second time here was redundant work and is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [330]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it returns inf if any y_pred is 0 where
# y_test > 0. Verify this is the intended quantity before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009459864855929317
R2 Score: 0.9985491642289018
RMSE: 0.097262
Entropy Value: 0.0010173017666235732
In [331]:
# Rank the model's inputs by XGBoost's importance scores.
# NOTE(review): the inputs are principal components carrying original feature
# names (see the PCA cell), so this ranking orders PCs — it should not be read
# as importance of the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[331]:
feature importance
1 human_development_index 0.731289
2 extreme_poverty 0.139555
0 hospital_beds_per_thousand 0.068121
5 population 0.044294
3 gdp_per_capita 0.014539
4 population_density 0.002202
In [332]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not reproducible on other
# machines; prefer a configurable data directory / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[332]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [333]:
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the filtered result an independent frame: later cells assign
# lag columns with df_updated[...] = ..., which on a filtered slice can raise
# SettingWithCopyWarning or silently fail to write.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [334]:
# Sanity check: the filtered frame should contain only the two selected countries.
df_updated
Out[334]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 40.8 22.222222
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 28.9 83.78 43.1 0.322149

2102 rows × 9 columns

In [335]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') confines each shift to one country, so values never leak
# across the country boundary. NOTE(review): assumes rows are date-sorted
# within each country — confirm upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [336]:
# The lag columns are NaN for each country's first 1/7/30 rows (no prior
# observation); treat the missing history as zero mortality.
for lag_col in ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [337]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split (test-set
# leakage) and on unscaled features, so large-magnitude columns dominate the
# components. Consider StandardScaler + PCA fit on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[337]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [338]:
# Keep only the first six principal components — chosen to match the six
# input variables used in the XGBoost population-health analysis.
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [339]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components, NOT the original
# features — reusing the raw feature names is misleading and propagates into
# the feature-importance table below. Prefer names like 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [340]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df); this step only removes 'location' from df_updated — confirm
# it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [341]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; with near-duplicate
# consecutive daily rows and lagged targets upstream this puts neighbours in
# both splits and inflates the test R². A chronological split or
# TimeSeriesSplit would give an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [342]:
# Fit scaling on the training set only, so test-set statistics do not leak
# through the scaler. The bare fit() on the last line renders the fitted
# estimator as the cell output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[342]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [343]:
# Apply the training-set scaling to the training data
X_train_scaled = scaler.transform(X_train)
In [344]:
# Apply scaling on the test set (using training-set statistics only — no leakage here)
X_test_scaled = scaler.transform(X_test)
In [345]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune: 3*3*3*3*2*2 = 324 candidates, i.e. 3240
# fits with the 10-fold CV used below — consider RandomizedSearchCV if
# runtime becomes an issue.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [346]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain k-fold shuffles time-ordered rows across folds; for a
# time-series target, TimeSeriesSplit avoids validating on the past.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9510892546005006
In [347]:
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the full training set with the best hyperparameters — refitting it
# a second time here was redundant work and is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [348]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it returns inf if any y_pred is 0 where
# y_test > 0. Verify this is the intended quantity before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  1.7690674535543163
R2 Score: 0.93014509580057
RMSE: 1.330063
Entropy Value: 0.007763981883565692
In [349]:
# Rank the model's inputs by XGBoost's importance scores.
# NOTE(review): the inputs are principal components carrying original feature
# names (see the PCA cell), so this ranking orders PCs — it should not be read
# as importance of the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[349]:
feature importance
0 cardiovasc_death_rate 0.452751
1 diabetes_prevalence 0.267413
2 female_smokers 0.133603
4 life_expectancy 0.081354
5 median_age 0.041533
3 male_smokers 0.023346
In [350]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not reproducible on other
# machines; prefer a configurable data directory / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[350]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [351]:
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the filtered result an independent frame: later cells assign
# lag columns with df_updated[...] = ..., which on a filtered slice can raise
# SettingWithCopyWarning or silently fail to write.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [352]:
# Sanity check: the filtered frame should contain only the two selected countries.
df_updated
Out[352]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.20 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.20 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.20 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.20 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.20 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14645 Switzerland 12/26/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14646 Switzerland 12/27/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322922
14647 Switzerland 12/28/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.323082
14648 Switzerland 12/29/2022 4.53 0.955 0.03 57410.166 214.243 8740471 0.322149

2102 rows × 9 columns

In [353]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') confines each shift to one country, so values never leak
# across the country boundary. NOTE(review): assumes rows are date-sorted
# within each country — confirm upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [354]:
# The lag columns are NaN for each country's first 1/7/30 rows (no prior
# observation); treat the missing history as zero mortality.
for lag_col in ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']:
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [355]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split (test-set
# leakage) and on unscaled features, so large-magnitude columns such as
# population dominate the components. Consider StandardScaler + PCA fit on
# the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[355]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [356]:
# Keep only the first six principal components — chosen to match the six
# input variables used in the XGBoost country-health analysis.
n_components = 6
all_components = pca.transform(df_updated.iloc[:, 2:])
principal_components = all_components[:, :n_components]
In [357]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components, NOT the original
# features — reusing the raw feature names is misleading and propagates into
# the feature-importance table below. Prefer names like 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [358]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df); this step only removes 'location' from df_updated — confirm
# it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [359]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; with near-duplicate
# consecutive daily rows and lagged targets upstream this puts neighbours in
# both splits and inflates the test R². A chronological split or
# TimeSeriesSplit would give an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [360]:
# Fit scaling on the training set only, so test-set statistics do not leak
# through the scaler. The bare fit() on the last line renders the fitted
# estimator as the cell output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[360]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [361]:
# Apply the training-set scaling to the training data
X_train_scaled = scaler.transform(X_train)
In [362]:
# Apply scaling on the test set (using training-set statistics only — no leakage here)
X_test_scaled = scaler.transform(X_test)
In [363]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune: 3*3*3*3*2*2 = 324 candidates, i.e. 3240
# fits with the 10-fold CV used below — consider RandomizedSearchCV if
# runtime becomes an issue.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [364]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain k-fold shuffles time-ordered rows across folds; for a
# time-series target, TimeSeriesSplit avoids validating on the past.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.952952674424612
In [365]:
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the full training set with the best hyperparameters — refitting it
# a second time here was redundant work and is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [366]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it returns inf if any y_pred is 0 where
# y_test > 0. Verify this is the intended quantity before reporting it.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.8303105819555134
R2 Score: 0.9672136491789828
RMSE: 0.911214
Entropy Value: 0.006240288209575662
In [367]:
# Rank the model's inputs by XGBoost's importance scores.
# NOTE(review): the inputs are principal components carrying original feature
# names (see the PCA cell), so this ranking orders PCs — it should not be read
# as importance of the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[367]:
feature importance
5 population 0.314746
1 human_development_index 0.291846
2 extreme_poverty 0.138111
0 hospital_beds_per_thousand 0.116382
4 population_density 0.079420
3 gdp_per_capita 0.059494
In [368]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not reproducible on other
# machines; prefer a configurable data directory / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[368]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [369]:
country1 = 'Cyprus'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the filtered result an independent frame: later cells assign
# lag columns with df_updated[...] = ..., which on a filtered slice can raise
# SettingWithCopyWarning or silently fail to write.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [370]:
# Sanity check: the filtered frame should contain only the two selected countries.
df_updated
Out[370]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 37.3 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 37.3 0.11011

2063 rows × 9 columns

In [371]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# groupby('location') confines each shift to one country, so values never leak
# across the country boundary. NOTE(review): assumes rows are date-sorted
# within each country — confirm upstream ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [372]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): this fabricates a zero mortality rate for the first 1/7/30 rows of
# each country where no lag exists. Presumably acceptable because early-pandemic
# mortality is ~0 in this data — verify, or drop those rows instead.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [373]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after location/date — including
# 'Mortality Rate' (the prediction target) and the three lagged-mortality columns.
# The fitted components therefore encode the target itself: target leakage that
# inflates the downstream CV score and test R^2. Fit PCA on predictor columns only.
# NOTE(review): PCA is run on unscaled data here (StandardScaler is applied later,
# after PCA), so large-variance columns dominate the components; scale first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[373]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [374]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of input variables retained for the XGBoost Model Analysis
# NOTE(review): the PCA input has 10 columns (6 features + Mortality Rate +
# 3 lagged-mortality columns), so this keeps only the first 6 of 10 components.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [375]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are cosmetic — column i holds the scores of
# principal component i, NOT the named original variable. Any downstream
# "feature importance" keyed on these names is therefore misleading; prefer
# names like 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [376]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused below — X is
# built from principal_df and y from 'Mortality Rate' — so this encoding looks
# like dead work; confirm and remove if so.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [377]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
# NOTE(review): these "columns" are PC1..PC6 scores that were merely labeled with
# the original feature names — X lives in principal-component space.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffle split on a daily time series places adjacent,
# near-duplicate days in both train and test, inflating the test R^2; consider
# a chronological split (train on earlier dates, test on later dates).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [378]:
# Fit scaling on the training set
# Fitting the scaler on X_train only (then reusing it on X_test) correctly avoids
# test-set leakage at this step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[378]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [379]:
# Apply scaling on the training set (uses statistics learned from X_train only)
X_train_scaled = scaler.transform(X_train)
In [380]:
# Apply scaling on the test set with the train-fitted scaler (no refit on test)
X_test_scaled = scaler.transform(X_test)
In [381]:
# Define XGBoost model
# NOTE(review): random_state pins XGBoost's stochastic sampling (subsample and
# colsample_bytree < 1 in the grid below), making the grid-search results
# reproducible across runs.
xgb_model = xgb.XGBRegressor(random_state=42)

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [382]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds; n_jobs=-1 parallelizes across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9884723572153133
In [383]:
# Fit the model using the best hyperparameters
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already refit on the full training set, so this .fit() call is redundant
# (harmless, but it repeats the work).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [384]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1
# and returns the KL divergence between them — not a standard regression error
# metric, and it yields inf if any y_pred element is <= 0 where y_test > 0.
# TODO confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0004166879516503694
R2 Score: 0.9978755777516257
RMSE: 0.020413
Entropy Value: 0.0007951400229065907
In [385]:
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC6 — the names attached below are the labels given
# to the PCs, not measurements of the original variables' importance.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[385]:
feature importance
1 diabetes_prevalence 0.625744
0 cardiovasc_death_rate 0.234341
5 median_age 0.057504
3 male_smokers 0.032153
2 female_smokers 0.027268
4 life_expectancy 0.022991
In [386]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — breaks on any other machine;
# prefer a configurable DATA_DIR / pathlib.Path set in a top config cell.
# NOTE(review): this read->select->lag->PCA->XGBoost pipeline is copy-pasted once
# per country pair / index throughout the notebook; factor it into a function
# parameterized by (country1, country2, feature_cols) and call it in a loop.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[386]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [387]:
country1 = 'Cyprus'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() so the lagged-column assignments in later cells write to an independent
# frame instead of a view (avoids SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [388]:
df_updated
Out[388]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2063 rows × 9 columns

In [389]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [390]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [391]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the three
# lagged-mortality columns, so the components leak the target into X and inflate
# the reported scores; PCA is also run before (not after) standardization.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[391]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [392]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [393]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns hold PC1..PC6 scores; the feature names here are labels
# only, not the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [394]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [395]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): these names label PC1..PC6 scores, not raw features.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffling of a daily time series places near-identical
# adjacent days in both sets; prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [396]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[396]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [397]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [398]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [399]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [400]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.988544640680141
In [401]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [402]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(pk, qk) is KL divergence of the normalized vectors, not a
# standard regression metric — TODO confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0009590270117916058
R2 Score: 0.9951105418033505
RMSE: 0.030968
Entropy Value: 0.0017192056737414868
In [403]:
feature_importances = best_model.feature_importances_
# NOTE(review): importances belong to PC1..PC6 (the model's actual inputs), not
# to the original variables whose names label them here.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[403]:
feature importance
1 human_development_index 0.542660
0 hospital_beds_per_thousand 0.171023
2 extreme_poverty 0.106136
3 gdp_per_capita 0.073260
4 population_density 0.055552
5 population 0.051370
In [404]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path; parameterize via a DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[404]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [405]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# .copy() so the lagged-column assignments in later cells write to an independent
# frame instead of a view (avoids SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [406]:
df_updated
Out[406]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 39.7 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.30 38.7 0.491388

2076 rows × 9 columns

In [407]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [408]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [409]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the three
# lagged-mortality columns — target leakage into the components; PCA is also run
# on unscaled data (standardization happens only after PCA).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[409]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [410]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [411]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns hold PC1..PC6 scores; the feature names are labels only.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [412]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [413]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
# NOTE(review): these names label PC1..PC6 scores, not raw features.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffling of a daily time series places near-identical
# adjacent days in both sets; prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [414]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[414]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [415]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [416]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [417]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [418]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979194282791959
In [419]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [420]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(pk, qk) is KL divergence of the normalized vectors, not a
# standard regression metric — TODO confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002869722959012224
R2 Score: 0.9987432806147093
RMSE: 0.053570
Entropy Value: 0.0005898323283403529
In [421]:
feature_importances = best_model.feature_importances_
# NOTE(review): importances belong to PC1..PC6 (the model's actual inputs), not
# to the original variables whose names label them here.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[421]:
feature importance
0 cardiovasc_death_rate 0.668046
1 diabetes_prevalence 0.137005
3 male_smokers 0.069868
2 female_smokers 0.062073
5 median_age 0.061625
4 life_expectancy 0.001384
In [422]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path; parameterize via a DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[422]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [423]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() so the lagged-column assignments in later cells write to an independent
# frame instead of a view (avoids SettingWithCopyWarning / silently lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [424]:
df_updated
Out[424]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388

2076 rows × 9 columns

In [425]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [426]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [427]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the three
# lagged-mortality columns — target leakage into the components; PCA is also run
# on unscaled data (standardization happens only after PCA).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[427]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [428]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [429]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns hold PC1..PC6 scores; the feature names are labels only.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [430]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [431]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): these names label PC1..PC6 scores, not raw features.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffling of a daily time series places near-identical
# adjacent days in both sets; prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [432]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[432]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [433]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [434]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [435]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [436]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986051830769147
In [437]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [438]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): entropy(pk, qk) is KL divergence of the normalized vectors, not a
# standard regression metric — TODO confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015122598797065847
R2 Score: 0.9993377457219498
RMSE: 0.038888
Entropy Value: 0.0003022118010805689
In [439]:
feature_importances = best_model.feature_importances_
# NOTE(review): importances belong to PC1..PC6 (the model's actual inputs), not
# to the original variables whose names label them here.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[439]:
feature importance
5 population 0.715384
1 human_development_index 0.205967
0 hospital_beds_per_thousand 0.052834
2 extreme_poverty 0.024062
3 gdp_per_capita 0.001458
4 population_density 0.000295
In [440]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute local path; parameterize via a DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[440]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [441]:
country1 = 'Slovakia'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age', 'Mortality Rate']]
# .copy() materializes the row selection so that the lagged-column assignments
# in later cells write to an independent frame rather than a view of the
# original (avoids pandas SettingWithCopyWarning and potentially lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [442]:
df_updated
Out[442]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 38.3 1.084791

2102 rows × 9 columns

In [443]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged mortality column per horizon (1 day, 7 days, 30 days),
# shifting within each country so series never bleed across locations.
for lag_name, lag_days in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [444]:
# The first day/week/month of each country's series has no lagged observation;
# treat those leading NaNs as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [445]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lag columns, so
# the target leaks into the components later used as model inputs; PCA is also
# fit on unscaled data and on the full dataset (train + test) before the
# split.  All three choices inflate downstream scores — confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[445]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [446]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep only the first 6 components (highest explained variance); note the PCA
# input here again includes the target and lag columns (see fit cell above).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [447]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC6 relabeled with
# original feature names — each PC mixes all input columns, so the names are
# misleading when interpreting downstream feature importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [448]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is taken from
# principal_df), so this step only replaces 'location' with dummy columns in
# df_updated — it could be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [449]:
# Model inputs: the six (relabeled) principal components; target: raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'median_age']
X = principal_df[selected_cols].values
# principal_df rows align with df_updated rows by position, so X and y match row-for-row.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on time-series rows lets the model
# train on the "future" of the same series; a chronological holdout would be
# more defensible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [450]:
# Fit scaling on the training set
# Fitting on the training split only keeps test-set statistics out of the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[450]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [451]:
# Apply scaling on the training set
# Standardize features using the statistics fitted above (mean 0, unit variance).
X_train_scaled = scaler.transform(X_train)
In [452]:
# Apply scaling on the test set
# Reuse the training-set scaler so both splits share one transformation.
X_test_scaled = scaler.transform(X_test)
In [453]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid searched below (3*3*3*3*2*2 = 324 combinations).
params = {
    'max_depth': [3, 4, 5],               # tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per boosting round
    'n_estimators': [50, 100, 150],       # number of boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum loss reduction to split
    'subsample': [0.8, 0.9],              # row sampling per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling per tree
}
In [454]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain KFold on time-series-derived rows mixes past and future
# within folds; TimeSeriesSplit would give less optimistic CV scores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9878328845258098
In [455]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV(refit=True, the default) already fit
# best_estimator_ on the full training set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [456]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy computes KL divergence of two (unnormalized)
# distributions; zeros in y_pred where y_test > 0 yield inf (as seen in this
# cell's original output).  Clip both vectors to a small positive floor.
# NOTE(review): KL divergence of raw mortality rates is not a standard
# regression metric — interpret with care.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00894710610093598
R2 Score: 0.9915900687687879
RMSE: 0.094589
Entropy Value: inf
In [457]:
# Rank the model inputs by gain-based importance.
# NOTE(review): these "features" are relabeled principal components, so the
# importances describe PCs, not the original raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[457]:
feature importance
1 diabetes_prevalence 0.468440
0 cardiovasc_death_rate 0.277746
5 median_age 0.202056
2 female_smokers 0.031399
4 life_expectancy 0.011402
3 male_smokers 0.008956
In [458]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — see earlier cells.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[458]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [459]:
country1 = 'Slovakia'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the selection an independent frame so later lag-column
# assignments don't target a view (SettingWithCopyWarning / lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [460]:
df_updated
Out[460]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2102 rows × 9 columns

In [461]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged mortality column per horizon (1 day, 7 days, 30 days),
# shifting within each country so series never bleed across locations.
for lag_name, lag_days in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [462]:
# Leading rows of each country's series have no lagged observation; treat
# those NaNs as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [463]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lags (target
# leakage), and PCA is fit on unscaled, pre-split data — see the matching
# note in the population-health-index section.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[463]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [464]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep the first 6 components; input again includes the target/lag columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [465]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components relabeled with raw feature
# names — misleading for interpreting downstream importances.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [466]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): dummy columns are unused downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [467]:
# Model inputs: the six (relabeled) principal components; target: raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Rows of principal_df and df_updated align by position, so X and y match row-for-row.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split on time-series rows — see earlier note.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [468]:
# Fit scaling on the training set
# Fitting on the training split only keeps test statistics out of the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[468]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [469]:
# Apply scaling on the training set
# Standardize features with the scaler fitted above.
X_train_scaled = scaler.transform(X_train)
In [470]:
# Apply scaling on the test set
# Reuse the training-set scaler so both splits share one transformation.
X_test_scaled = scaler.transform(X_test)
In [471]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid searched below (3*3*3*3*2*2 = 324 combinations).
params = {
    'max_depth': [3, 4, 5],               # tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per boosting round
    'n_estimators': [50, 100, 150],       # number of boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum loss reduction to split
    'subsample': [0.8, 0.9],              # row sampling per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling per tree
}
In [472]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain KFold on time-series-derived rows — see earlier note;
# TimeSeriesSplit would give less optimistic CV scores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.988734157660151
In [473]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV(refit=True) already fit best_estimator_.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [474]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy computes KL divergence of two (unnormalized)
# distributions; zeros in y_pred where y_test > 0 yield inf (as seen in this
# cell's original output).  Clip both vectors to a small positive floor.
# NOTE(review): KL divergence of raw mortality rates is not a standard
# regression metric — interpret with care.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0038799006752455517
R2 Score: 0.9963530444934217
RMSE: 0.062289
Entropy Value: inf
In [475]:
# Rank the model inputs by gain-based importance.
# NOTE(review): these "features" are relabeled principal components, so the
# importances describe PCs, not the original raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[475]:
feature importance
5 population 0.383000
1 human_development_index 0.344298
0 hospital_beds_per_thousand 0.134705
3 gdp_per_capita 0.067241
2 extreme_poverty 0.057677
4 population_density 0.013079
In [2]:
# Country Pair by Pair Analysis relative to diabetes prevalence
In [3]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute local path; prefer a configurable DATA_DIR.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[3]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [4]:
# Showing the pairings of countries based on diabetes prevalence (13 pairs of countries)
def _rows_for(country):
    """Return the rows of `df` belonging to a single country."""
    return df[df.location == country]

df_Belgium = _rows_for("Belgium")
df_Estonia = _rows_for("Estonia")

df_France = _rows_for("France")
df_Iceland = _rows_for("Iceland")

df_Ireland = _rows_for("Ireland")
df_Italy = _rows_for("Italy")

df_Latvia = _rows_for("Latvia")
df_Luxembourg = _rows_for("Luxembourg")

df_Netherlands = _rows_for("Netherlands")
df_Sweden = _rows_for("Sweden")

df_UnitedKingdom = _rows_for("United Kingdom")
df_Austria = _rows_for("Austria")

df_Bulgaria = _rows_for("Bulgaria")
df_Czechia = _rows_for("Czechia")

df_Denmark = _rows_for("Denmark")
df_Finland = _rows_for("Finland")

df_Switzerland = _rows_for("Switzerland")
df_Canada = _rows_for("Canada")

df_Cyprus = _rows_for("Cyprus")
df_Portugal = _rows_for("Portugal")

df_Romania = _rows_for("Romania")
df_Serbia = _rows_for("Serbia")

df_Slovakia = _rows_for("Slovakia")
df_Slovenia = _rows_for("Slovenia")

df_Spain = _rows_for("Spain")
df_UnitedStates = _rows_for("United States")
In [5]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [6]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file.
# index=False keeps the DataFrame row index out of the file; without it each
# re-import gains a spurious 'Unnamed: 0' column.
# NOTE(review): this writes to the working directory, but later cells read
# from C:/Users/marco/Downloads — the two paths must agree for a clean re-run.
dataframe_one.to_csv("dataframe-one.csv", index=False)
In [7]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): the previous cell wrote "dataframe-one.csv" to the working
# directory, but this reads from a hardcoded Downloads path — the two must
# match for Restart-and-Run-All to work.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[7]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [8]:
country1 = 'Belgium'
country2 = 'Estonia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() materializes the selection so later lag-column assignments write to
# an independent frame instead of a view (avoids SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [10]:
df_updated
Out[10]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 25.1 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.464100
7306 Estonia 12/26/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.464100
7307 Estonia 12/27/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.463645
7308 Estonia 12/28/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.466423
7309 Estonia 12/29/2022 255.569 24.5 39.3 78.74 19.452 42.7 0.466423

2121 rows × 9 columns

In [11]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged mortality column per horizon (1 day, 7 days, 30 days),
# shifting within each country so series never bleed across locations.
for lag_name, lag_days in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [12]:
# Leading rows of each country's series have no lagged observation; treat
# those NaNs as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [13]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lags (target
# leakage); PCA is also fit on unscaled, pre-split data — see earlier notes.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[13]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [14]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep the first 6 components; input again includes the target/lag columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [15]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are principal components relabeled with raw feature
# names — misleading for interpreting downstream importances.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [16]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): dummy columns are unused downstream (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [17]:
# Model inputs: the six (relabeled) principal components; target: raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Rows of principal_df and df_updated align by position, so X and y match row-for-row.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split on time-series rows — see earlier note.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [18]:
# Fit scaling on the training set
# Fitting on the training split only keeps test statistics out of the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[18]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [19]:
# Apply scaling on the training set
# Standardize features with the scaler fitted above.
X_train_scaled = scaler.transform(X_train)
In [20]:
# Apply scaling on the test set
# Reuse the training-set scaler so both splits share one transformation.
X_test_scaled = scaler.transform(X_test)
In [21]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid searched below (3*3*3*3*2*2 = 324 combinations).
params = {
    'max_depth': [3, 4, 5],               # tree depth
    'learning_rate': [0.1, 0.01, 0.001],  # shrinkage per boosting round
    'n_estimators': [50, 100, 150],       # number of boosting rounds
    'gamma': [0, 0.1, 0.2],               # minimum loss reduction to split
    'subsample': [0.8, 0.9],              # row sampling per tree
    'colsample_bytree': [0.8, 0.9],       # column sampling per tree
}
In [22]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): plain KFold on time-series-derived rows — see earlier note;
# TimeSeriesSplit would give less optimistic CV scores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9989949619386504
In [23]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV(refit=True) already fit best_estimator_.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [24]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy computes KL divergence of two (unnormalized)
# distributions; zeros in y_pred where y_test > 0 yield inf.  Clip both
# vectors to a small positive floor to keep the value finite.
# NOTE(review): KL divergence of raw mortality rates is not a standard
# regression metric — interpret with care.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012575343517403465
R2 Score: 0.9990037276216706
RMSE: 0.112140
Entropy Value: 0.0005772386498847016
In [25]:
# Rank the model inputs by gain-based importance.
# NOTE(review): these "features" are relabeled principal components, so the
# importances describe PCs, not the original raw variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[25]:
feature importance
1 female_smokers 0.786635
0 cardiovasc_death_rate 0.176158
2 male_smokers 0.024730
5 median_age 0.010989
3 life_expectancy 0.001396
4 aged_65_older 0.000091
In [26]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — see earlier cells.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[26]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [27]:
country1 = 'Belgium'
country2 = 'Estonia'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materializes the selection so later lag-column assignments write to
# an independent frame instead of a view (avoids SettingWithCopyWarning).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [28]:
df_updated
Out[28]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 4.69 0.892 0.5 29481.252 31.033 1326064 0.466423

2121 rows × 9 columns

In [29]:
'''
Lagged mortality features (previous day / week / month) turn the OWID COVID-19
time series into a tabular supervised-learning problem, which is the form the
XGBoost model consumes: each row carries its own recent mortality history as
ordinary feature columns.
'''
# Build the three lag features in one pass; shift() is applied per country so
# one country's first rows never see another country's mortality history.
lag_spec = {'prev_day_mortality': 1, 'prev_week_mortality': 7, 'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in lag_spec.items():
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [30]:
# Rows earlier than each lag window have no history; treat "no prior data" as a
# mortality rate of 0 rather than NaN so the model can train on every row.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [31]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself plus the three lagged
# mortality columns, so the fitted components mix the prediction target into the
# inputs (target leakage) — the near-perfect CV/R^2 scores downstream are inflated
# by this.
# NOTE(review): PCA here runs on unscaled data; 'population' (~10^6-10^8) dominates
# the covariance, so leading components are essentially population. Standardize
# before PCA (the StandardScaler below is applied only after the projection).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[31]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [32]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the fitted PCA saw 10 columns (6 features + 'Mortality Rate' +
# 3 lags), so these are the first 6 of 10 components, not a re-expression of the
# 6 raw features.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [33]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of all ten inputs, including the mortality columns),
# not the original variable it is named after. Downstream "feature importances"
# therefore rank components and cannot be attributed to the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [34]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below is built from
# principal_df, and only 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [35]:
# Model inputs: the six principal components (named after the raw variables);
# target: the unprojected mortality rate, row-aligned with principal_df.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for XGBoost Model
# (70/30 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [36]:
# Fit scaling on the training set
# (fit on train only, so test-set statistics never leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[36]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [37]:
# Apply scaling on the training set
# (uses the train-fitted mean/std from the scaler above)
X_train_scaled = scaler.transform(X_train)
In [38]:
# Apply scaling on the test set
# (same train-fitted scaler — no refit on test data)
X_test_scaled = scaler.transform(X_test)
In [39]:
# Define XGBoost model
# NOTE(review): consider pinning random_state explicitly — subsample/colsample < 1
# in the grid below make training stochastic unless the seed is fixed.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# (3*3*3*3*2*2 = 324 combinations; with cv=10 below that is 3240 fits per run)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [40]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the near-perfect CV score below is expected regardless of the
# demographic features' real predictive power — the PCA inputs were fit on a
# matrix that includes 'Mortality Rate' and its lags, leaking the target.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985104636986213
In [41]:
# Use the best hyperparameters found by the grid search.
# GridSearchCV (refit=True, the default) has already retrained best_estimator_
# on the full training set, so a second .fit() call here was redundant work and
# has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [42]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arguments into
# probability distributions and returns a KL divergence — it is not a standard
# regression metric, and zeros in y_test/y_pred make it ill-defined; consider
# dropping it or reporting MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012715196293674761
R2 Score: 0.9989926478879171
RMSE: 0.112762
Entropy Value: 0.0008232624023628059
In [43]:
feature_importances = best_model.feature_importances_
# NOTE(review): 'feature' here names PCA components (labelled with raw-variable
# names upstream), so these importances rank components — linear mixes of all
# inputs including the mortality columns — not the named variables themselves.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[43]:
feature importance
1 human_development_index 0.744249
0 hospital_beds_per_thousand 0.102687
2 extreme_poverty 0.100773
5 population 0.041484
3 gdp_per_capita 0.010488
4 population_density 0.000319
In [55]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# Path(DATA_DIR) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[55]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [56]:
country1 = 'France'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): same copy-pasted pipeline as the other country pairs — a single
# parameterised function taking (country1, country2, feature_cols) would remove
# the duplication. Consider .copy() on the filtered frame so the later lagged-
# column assignments do not write through a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [57]:
# Rich display of the filtered two-country frame
df_updated
Out[57]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
8377 France 1/25/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
8378 France 1/26/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
8379 France 1/27/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
8380 France 1/28/2020 86.060 30.1 35.6 82.66 19.718 42.0 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 14.3 15.2 82.99 14.431 37.3 0.11011

2107 rows × 9 columns

In [58]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [59]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [60]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lags as well
# as the six population-health features, so the components embed the target
# (data leakage) and PCA runs on unscaled columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[60]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [61]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA saw 10 columns (6 features + target + 3 lags), so these
# are the first 6 of 10 components, not a re-expression of the 6 raw features.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [62]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component (a mix
# of all ten inputs, mortality included), not the raw variable it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [63]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused downstream; only 'Mortality Rate'
# is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [64]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [65]:
# Fit scaling on the training set
# (train-only fit keeps test-set statistics out of the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[65]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [66]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [67]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [68]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [69]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): near-perfect CV scores are expected here — the PCA inputs were
# fit on a matrix containing 'Mortality Rate' and its lags, leaking the target.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9952822274580472
In [70]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV's default refit=True already retrained
# best_estimator_ on the training data.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [71]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its inputs into distributions and
# returns a KL divergence — not a regression metric; zeros in y make it fragile.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.04901676742466076
R2 Score: 0.9961022286797971
RMSE: 0.221397
Entropy Value: 0.002171781112379247
In [72]:
feature_importances = best_model.feature_importances_
# NOTE(review): 'feature' labels PCA components that were named after the raw
# variables upstream — the importances rank components, not the variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[72]:
feature importance
1 female_smokers 0.806289
0 cardiovasc_death_rate 0.144819
5 median_age 0.021652
2 male_smokers 0.010381
3 life_expectancy 0.008614
4 aged_65_older 0.008244
In [73]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable Path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[73]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [74]:
country1 = 'France'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): same copy-pasted pipeline as the other country pairs — a single
# parameterised function taking (country1, country2, feature_cols) would remove
# the duplication. Consider .copy() on the filtered frame so the later lagged-
# column assignments do not write through a view.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [75]:
# Rich display of the filtered two-country frame
df_updated
Out[75]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2107 rows × 9 columns

In [76]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [77]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [78]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lags as well
# as the six country-health features, so the components embed the target (data
# leakage) and PCA runs on unscaled columns (population dominates the variance).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[78]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [79]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA saw 10 columns (6 features + target + 3 lags), so these
# are the first 6 of 10 components, not a re-expression of the 6 raw features.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [80]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component (a mix
# of all ten inputs, mortality included), not the raw variable it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [81]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused downstream; only 'Mortality Rate'
# is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [82]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [83]:
# Fit scaling on the training set
# (train-only fit keeps test-set statistics out of the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[83]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [84]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [85]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [86]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [87]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): near-perfect CV scores are expected here — the PCA inputs were
# fit on a matrix containing 'Mortality Rate' and its lags, leaking the target.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9946715554617531
In [88]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV's default refit=True already retrained
# best_estimator_ on the training data.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [89]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its inputs into distributions and
# returns a KL divergence — not a regression metric; zeros in y make it fragile.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06349754860870421
R2 Score: 0.9949507293753999
RMSE: 0.251987
Entropy Value: 0.002273841175109527
In [90]:
feature_importances = best_model.feature_importances_
# NOTE(review): 'feature' labels PCA components that were named after the raw
# variables upstream — the importances rank components, not the variables.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[90]:
feature importance
1 human_development_index 0.739557
0 hospital_beds_per_thousand 0.133697
4 population_density 0.060550
5 population 0.031155
3 gdp_per_capita 0.017913
2 extreme_poverty 0.017128
In [91]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable Path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[91]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [92]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): same copy-pasted pipeline as the other country pairs — a single
# parameterised function would remove the duplication. Consider .copy() on the
# filtered frame so later lagged-column assignments do not write through a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [93]:
# Rich display of the filtered two-country frame
df_updated
Out[93]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 23.0 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 19.8 27.8 83.51 23.021 47.9 0.735109

2099 rows × 9 columns

In [94]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [95]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [96]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its three lags as well
# as the six population-health features, so the components embed the target
# (data leakage) and PCA runs on unscaled columns.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[96]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [97]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA saw 10 columns (6 features + target + 3 lags), so these
# are the first 6 of 10 components, not a re-expression of the 6 raw features.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [98]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading labels — each column is a principal component (a mix
# of all ten inputs, mortality included), not the raw variable it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [99]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused downstream; only 'Mortality Rate'
# is read from df_updated after this point.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [100]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [101]:
# Fit scaling on the training set
# (train-only fit keeps test-set statistics out of the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[101]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [102]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [103]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [104]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [105]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): near-perfect CV scores are expected here — the PCA inputs were
# fit on a matrix containing 'Mortality Rate' and its lags, leaking the target.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990989661235357
In [106]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV's default refit=True already retrained
# best_estimator_ on the training data.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [107]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its inputs into distributions and
# returns a KL divergence — not a regression metric; zeros in y make it fragile.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0062496276503529605
R2 Score: 0.9994865789352944
RMSE: 0.079055
Entropy Value: 0.00034870367258360715
In [108]:
# Tabulate and rank the model's importances.  (Reminder: the inputs are PCA
# components, so each "feature" here is a mixture of the original variables.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[108]:
feature importance
1 female_smokers 0.436515
0 cardiovasc_death_rate 0.419361
5 median_age 0.110731
2 male_smokers 0.031872
3 life_expectancy 0.000899
4 aged_65_older 0.000622
In [109]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[109]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [110]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the country health index
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [111]:
df_updated
Out[111]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 205.859 59037472 0.735109

2099 rows × 9 columns

In [112]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [113]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country have no history, so their lags are NaN).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [114]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the three
# lagged-mortality columns, so the prediction target leaks into the fitted
# components -- this plausibly explains the near-perfect downstream R^2.
# Consider fitting PCA on predictor columns only.
# NOTE(review): PCA here runs on unscaled data (StandardScaler is applied only
# after the projection), so large-magnitude variables dominate the components.
# Standardising before PCA is the usual order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[114]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [115]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [116]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels are the original variable names, but each column
# is a principal component (a mixture of all inputs); reusing the raw names
# makes the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [117]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df), so this step effectively only renames the location column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [118]:
# Use the first 6 principal components as the model inputs.
# NOTE(review): these are PCA component scores, not the raw variables -- the
# names below are reused labels, so the downstream "feature importances"
# describe components (mixtures of all inputs), not the original features.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a daily time series places adjacent
# days in train and test; combined with lag-derived components this likely
# inflates the held-out R^2 -- a chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [119]:
# Fit scaling on the training set
# NOTE(review): scaling is fitted on the training set only (no test leakage
# from this step).  It is also redundant for a tree-based model -- XGBoost is
# invariant to monotone feature scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[119]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [120]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [121]:
# Apply scaling on the test set
# Uses the statistics learned from the training set.
X_test_scaled = scaler.transform(X_test)
In [122]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space explored by the grid search below
# (3 * 3 * 3 * 3 * 2 * 2 = 324 combinations).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [123]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 model fits; n_jobs=-1 runs
# them across all available cores.  refit=True (the default) retrains the
# best configuration on the whole training set afterwards.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988984971748192
In [124]:
# Retrieve the model refit on the full training set by GridSearchCV.
# GridSearchCV uses refit=True by default, so best_estimator_ is already
# trained on X_train_scaled/y_train -- the original explicit second
# best_model.fit(...) call was redundant work and has been dropped.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [125]:
# Evaluate the XGBoost model: MSE, RMSE, R^2 score, and a KL-divergence "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) treats both arrays as (unnormalised)
# distributions and returns inf whenever qk is 0 at a position where pk > 0
# (this notebook printed "Entropy Value: inf" for one country pair).
# Flooring both arrays at a tiny positive value keeps the statistic finite
# while leaving genuinely positive entries effectively unchanged.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009889496543689754
R2 Score: 0.9991875554626719
RMSE: 0.099446
Entropy Value: 0.00044270709478587373
In [126]:
# Tabulate and rank the model's importances.  (Reminder: the inputs are PCA
# components, so each "feature" here is a mixture of the original variables.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[126]:
feature importance
1 human_development_index 0.702587
0 hospital_beds_per_thousand 0.200820
5 population 0.042593
2 extreme_poverty 0.026734
3 gdp_per_capita 0.018469
4 population_density 0.008797
In [127]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[127]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [128]:
country1 = 'Latvia'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the population health index
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [129]:
df_updated
Out[129]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 20.9 26.0 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 25.6 51.0 75.29 19.754 43.9 0.631969

2079 rows × 9 columns

In [130]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [131]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country have no history, so their lags are NaN).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [132]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the three
# lagged-mortality columns, so the prediction target leaks into the fitted
# components -- this plausibly explains the near-perfect downstream R^2.
# Consider fitting PCA on predictor columns only.
# NOTE(review): PCA here runs on unscaled data (StandardScaler is applied only
# after the projection), so large-magnitude variables dominate the components.
# Standardising before PCA is the usual order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[132]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [133]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [134]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels are the original variable names, but each column
# is a principal component (a mixture of all inputs); reusing the raw names
# makes the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [135]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df), so this step effectively only renames the location column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [136]:
# Use the first 6 principal components as the model inputs.
# NOTE(review): these are PCA component scores, not the raw variables -- the
# names below are reused labels, so the downstream "feature importances"
# describe components (mixtures of all inputs), not the original features.
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a daily time series places adjacent
# days in train and test; combined with lag-derived components this likely
# inflates the held-out R^2 -- a chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [137]:
# Fit scaling on the training set
# NOTE(review): scaling is fitted on the training set only (no test leakage
# from this step).  It is also redundant for a tree-based model -- XGBoost is
# invariant to monotone feature scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[137]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [138]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [139]:
# Apply scaling on the test set
# Uses the statistics learned from the training set.
X_test_scaled = scaler.transform(X_test)
In [140]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space explored by the grid search below
# (3 * 3 * 3 * 3 * 2 * 2 = 324 combinations).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [141]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 model fits; n_jobs=-1 runs
# them across all available cores.  refit=True (the default) retrains the
# best configuration on the whole training set afterwards.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981149623768651
In [142]:
# Retrieve the model refit on the full training set by GridSearchCV.
# GridSearchCV uses refit=True by default, so best_estimator_ is already
# trained on X_train_scaled/y_train -- the original explicit second
# best_model.fit(...) call was redundant work and has been dropped.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [143]:
# Evaluate the XGBoost model: MSE, RMSE, R^2 score, and a KL-divergence "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) treats both arrays as (unnormalised)
# distributions and returns inf whenever qk is 0 at a position where pk > 0
# (this notebook printed "Entropy Value: inf" for one country pair).
# Flooring both arrays at a tiny positive value keeps the statistic finite
# while leaving genuinely positive entries effectively unchanged.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0022798086212839955
R2 Score: 0.9942594043941946
RMSE: 0.047747
Entropy Value: 0.0007392045794560473
In [144]:
# Tabulate and rank the model's importances.  (Reminder: the inputs are PCA
# components, so each "feature" here is a mixture of the original variables.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[144]:
feature importance
1 female_smokers 0.846222
0 cardiovasc_death_rate 0.084867
5 median_age 0.044414
2 male_smokers 0.020585
3 life_expectancy 0.002959
4 aged_65_older 0.000952
In [145]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[145]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [146]:
country1 = 'Latvia'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the country health index
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [147]:
df_updated
Out[147]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 25063.846 31.212 1850654 0.631969

2079 rows × 9 columns

In [148]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [149]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country have no history, so their lags are NaN).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [150]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the three
# lagged-mortality columns, so the prediction target leaks into the fitted
# components -- this plausibly explains the near-perfect downstream R^2.
# Consider fitting PCA on predictor columns only.
# NOTE(review): PCA here runs on unscaled data (StandardScaler is applied only
# after the projection), so large-magnitude variables dominate the components.
# Standardising before PCA is the usual order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[150]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [151]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [152]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these labels are the original variable names, but each column
# is a principal component (a mixture of all inputs); reusing the raw names
# makes the later feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [153]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X is built from
# principal_df), so this step effectively only renames the location column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [154]:
# Use the first 6 principal components as the model inputs.
# NOTE(review): these are PCA component scores, not the raw variables -- the
# names below are reused labels, so the downstream "feature importances"
# describe components (mixtures of all inputs), not the original features.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a daily time series places adjacent
# days in train and test; combined with lag-derived components this likely
# inflates the held-out R^2 -- a chronological split would be stricter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [155]:
# Fit scaling on the training set
# NOTE(review): scaling is fitted on the training set only (no test leakage
# from this step).  It is also redundant for a tree-based model -- XGBoost is
# invariant to monotone feature scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[155]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [156]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [157]:
# Apply scaling on the test set
# Uses the statistics learned from the training set.
X_test_scaled = scaler.transform(X_test)
In [158]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space explored by the grid search below
# (3 * 3 * 3 * 3 * 2 * 2 = 324 combinations).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [159]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 model fits; n_jobs=-1 runs
# them across all available cores.  refit=True (the default) retrains the
# best configuration on the whole training set afterwards.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9980463202050925
In [160]:
# Retrieve the model refit on the full training set by GridSearchCV.
# GridSearchCV uses refit=True by default, so best_estimator_ is already
# trained on X_train_scaled/y_train -- the original explicit second
# best_model.fit(...) call was redundant work and has been dropped.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [161]:
# Evaluate the XGBoost model: MSE, RMSE, R^2 score, and a KL-divergence "entropy".
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) treats both arrays as (unnormalised)
# distributions and returns inf whenever qk is 0 at a position where pk > 0 --
# exactly what happened here ("Entropy Value: inf" in the original output).
# Flooring both arrays at a tiny positive value keeps the statistic finite
# while leaving genuinely positive entries effectively unchanged.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0019845762199695097
R2 Score: 0.995002804436573
RMSE: 0.044549
Entropy Value: inf
In [162]:
# Tabulate and rank the model's importances.  (Reminder: the inputs are PCA
# components, so each "feature" here is a mixture of the original variables.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[162]:
feature importance
1 human_development_index 0.485630
5 population 0.260661
0 hospital_beds_per_thousand 0.204685
2 extreme_poverty 0.042192
3 gdp_per_capita 0.005108
4 population_density 0.001724
In [163]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[163]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [164]:
country1 = 'Netherlands'
country2 = 'Sweden'

# Extracting important features for XGBoost Model Analysis for the population health index
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [165]:
df_updated
Out[165]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 18.8 18.9 82.80 19.985 41.0 0.816005

2100 rows × 9 columns

In [166]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [167]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first rows of each country have no history, so their lags are NaN).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [168]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the three
# lagged-mortality columns, so the prediction target leaks into the fitted
# components -- this plausibly explains the near-perfect downstream R^2.
# Consider fitting PCA on predictor columns only.
# NOTE(review): PCA here runs on unscaled data (StandardScaler is applied only
# after the projection), so large-magnitude variables dominate the components.
# Standardising before PCA is the usual order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[168]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [169]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): each retained component is a linear mixture of ALL columns fed
# to pca.fit (including the target and its lags), not one of the 6 original inputs.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [170]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear combination of every PCA input column), not the original
# feature it is named after. Downstream "feature importances" therefore rank
# components, not these named features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [171]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused afterwards
# (X is built from principal_df and y from 'Mortality Rate'), so this step may
# be droppable; it also removes the original 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [172]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first six principal components (named after, but not equal to,
# the original features); y is the raw per-row mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles rows at random, so later dates land
# in training while earlier dates are tested — optimistic for a time series.
# Consider shuffle=False or a date-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [173]:
# Fit scaling on the training set
# The scaler learns mean/std from training rows only, so test statistics do not
# leak into the scaling parameters; the bare fit() call displays the estimator.
scaler = StandardScaler()
scaler.fit(X_train)
Out[173]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [174]:
# Apply scaling on the training set
# (standardizes with the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [175]:
# Apply scaling on the test set
# (same train-fitted statistics; the scaler is deliberately NOT refit on test data)
X_test_scaled = scaler.transform(X_test)
In [176]:
# Base XGBoost regressor; its hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid: tree depth, learning rate (shrinkage), number
# of boosting rounds, minimum split loss, and row/column subsampling rates.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [177]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelizes over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991742414438469
In [178]:
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refit on the entire training set — the extra
# best_model.fit(X_train_scaled, y_train) call in the original was redundant
# and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test features
y_pred = best_model.predict(X_test_scaled)
In [179]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalizing both arguments into probability distributions — applying it to
# raw mortality values (which include zeros) is of doubtful statistical meaning.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011924917352362436
R2 Score: 0.9988595164779729
RMSE: 0.109201
Entropy Value: 0.0006866784019283522
In [180]:
# NOTE(review): these "features" are PCA components relabeled with original
# feature names, so this ranking does NOT measure the original variables'
# importance — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[180]:
feature importance
1 female_smokers 0.737708
2 male_smokers 0.199388
0 cardiovasc_death_rate 0.035390
3 life_expectancy 0.017898
5 median_age 0.007344
4 aged_65_older 0.002271
In [181]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[181]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [182]:
country1 = 'Netherlands'
country2 = 'Sweden'

# Keep only the socio-economic / health-system features used by the
# country-health-index XGBoost analysis, restricted to the two chosen countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [183]:
# Quick sanity check of the filtered two-country frame (rows × columns below)
df_updated
Out[183]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 46949.283 24.718 10549349 0.816005

2100 rows × 9 columns

In [184]:
# Reshape the time series into supervised-learning form for XGBoost: add the
# previous-day / previous-week / previous-month mortality rate per country as
# explicit lag columns (groupby + shift).
# NOTE(review): shift() relies on rows being chronologically ordered within
# each location — verify the CSV is sorted by date.
grouped_mortality = df_updated.groupby(['location'])['Mortality Rate']
for new_col, days_back in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    df_updated[new_col] = grouped_mortality.shift(days_back)
In [185]:
# Each country's earliest days have no lag history, leaving NaNs in the lag
# columns; zero them out.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [186]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] also includes 'Mortality Rate' (the prediction
# target) and the lagged-mortality columns, so the components are partly built
# from the target itself — likely data leakage. PCA is also fit on unscaled
# columns and on all rows (before the train/test split).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[186]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [187]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): each retained component mixes ALL columns fed to pca.fit
# (including the target and its lags), not one of the 6 original inputs.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [188]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): column names are misleading — each column is a principal
# component (a linear combination of every PCA input column), not the original
# feature it is named after; downstream importances rank components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [189]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns appear unused afterwards (X comes
# from principal_df, y from 'Mortality Rate'); this also drops 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [190]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first six principal components (named after, but not equal to,
# the original features); y is the raw per-row mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffling mixes future and past days across train/test —
# optimistic for a time series; consider shuffle=False or a date-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [191]:
# Fit scaling on the training set
# Mean/std are learned from training rows only, so test statistics do not leak
# into the scaling parameters; the bare fit() call displays the estimator.
scaler = StandardScaler()
scaler.fit(X_train)
Out[191]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [192]:
# Apply scaling on the training set
# (standardizes with the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [193]:
# Apply scaling on the test set
# (same train-fitted statistics; the scaler is deliberately NOT refit on test data)
X_test_scaled = scaler.transform(X_test)
In [194]:
# Base XGBoost regressor; hyperparameters are selected by the grid search below.
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid: tree depth, learning rate, boosting rounds,
# minimum split loss, and row/column subsampling rates.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [195]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelizes over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992006206155605
In [196]:
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refit on the entire training set — the extra
# best_model.fit(X_train_scaled, y_train) call in the original was redundant
# and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test features
y_pred = best_model.predict(X_test_scaled)
In [197]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between
# normalized distributions — using raw mortality values (with zeros) here is
# of doubtful statistical meaning.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.015468257535200518
R2 Score: 0.9985206360503729
RMSE: 0.124371
Entropy Value: 0.0009440623489042599
In [198]:
# NOTE(review): these "features" are PCA components relabeled with original
# feature names; the ranking does NOT measure the original variables'
# importance — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[198]:
feature importance
1 human_development_index 0.669177
2 extreme_poverty 0.171464
0 hospital_beds_per_thousand 0.112389
5 population 0.024007
3 gdp_per_capita 0.019415
4 population_density 0.003549
In [203]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[203]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [8]:
country1 = 'United Kingdom'
country2 = 'Austria'

# Restrict to the demographic / behavioral features used by the
# population-health-index XGBoost analysis, for the two chosen countries.
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'female_smokers',
             'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
             'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [9]:
# Quick sanity check of the filtered two-country frame (rows × columns below)
df_updated
Out[9]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564
13606 United Kingdom 12/26/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564
13607 United Kingdom 12/27/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564
13608 United Kingdom 12/28/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564
13609 United Kingdom 12/29/2022 122.137 20.0 24.7 81.32 18.517 40.8 0.883564

2102 rows × 9 columns

In [10]:
# Turn the time series into a supervised-learning table for XGBoost by adding
# previous-day / previous-week / previous-month mortality rates as lag columns
# (groupby + shift per country).
# NOTE(review): shift() assumes chronological row order within each location —
# confirm the CSV is sorted by date.
lagged = df_updated.groupby(['location'])['Mortality Rate']
for target_col, offset in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    df_updated[target_col] = lagged.shift(offset)
In [11]:
# The start of each country's series has no lag history; fill those NaNs with 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [12]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the
# lagged-mortality columns, so the components are partly derived from the
# target — likely leakage. PCA is also fit on unscaled columns and before the
# train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[12]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [13]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): each retained component mixes ALL columns fed to pca.fit
# (including the target and its lags), not one of the 6 original inputs.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [14]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): column names are misleading — each column is a principal
# component (a linear combination of all PCA input columns), not the original
# feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [15]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns appear unused afterwards (X comes
# from principal_df, y from 'Mortality Rate'); this also drops 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [16]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first six principal components (named after, but not equal to,
# the original features); y is the raw per-row mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffling mixes future and past days across train/test —
# optimistic for a time series; consider shuffle=False or a date-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [17]:
# Fit scaling on the training set
# Mean/std are learned from training rows only; the bare fit() call displays
# the estimator as the cell output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[17]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [18]:
# Apply scaling on the training set
# (standardizes with the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [19]:
# Apply scaling on the test set
# (same train-fitted statistics; the scaler is deliberately NOT refit on test data)
X_test_scaled = scaler.transform(X_test)
In [20]:
# Base XGBoost regressor; hyperparameters come from the grid search below.
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid: tree depth, learning rate, boosting rounds,
# minimum split loss, and row/column subsampling rates.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [21]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelizes over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.982683505807579
In [22]:
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refit on the entire training set — the extra
# best_model.fit(X_train_scaled, y_train) call in the original was redundant
# and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test features
y_pred = best_model.predict(X_test_scaled)
In [23]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between
# normalized distributions — applying it to raw mortality values (with zeros)
# is of doubtful statistical meaning.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  5.700044545743729
R2 Score: 0.8519894967019457
RMSE: 2.387477
Entropy Value: 0.01428821441905026
In [24]:
# NOTE(review): these "features" are PCA components relabeled with original
# feature names; the ranking does NOT measure the original variables'
# importance — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[24]:
feature importance
5 median_age 0.575495
0 cardiovasc_death_rate 0.331887
1 female_smokers 0.046679
2 male_smokers 0.020746
4 aged_65_older 0.017492
3 life_expectancy 0.007702
In [25]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[25]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [26]:
country1 = 'United Kingdom'
country2 = 'Austria'

# Keep the socio-economic / health-system features used by the
# country-health-index XGBoost analysis, restricted to the two chosen countries.
keep_cols = ['location', 'date', 'hospital_beds_per_thousand',
             'human_development_index', 'extreme_poverty', 'gdp_per_capita',
             'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [27]:
# Quick sanity check of the filtered two-country frame (rows × columns below)
df_updated
Out[27]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564
13606 United Kingdom 12/26/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564
13607 United Kingdom 12/27/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564
13608 United Kingdom 12/28/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564
13609 United Kingdom 12/29/2022 2.54 0.932 0.2 39753.244 272.898 67508936 0.883564

2102 rows × 9 columns

In [28]:
# Add supervised-learning lag features for XGBoost: previous-day /
# previous-week / previous-month mortality rate per country via groupby + shift.
# NOTE(review): shift() relies on chronological row order within each
# location — verify the CSV is sorted by date.
per_country_mortality = df_updated.groupby(['location'])['Mortality Rate']
for lag_name, shift_by in (('prev_day_mortality', 1),
                           ('prev_week_mortality', 7),
                           ('prev_month_mortality', 30)):
    df_updated[lag_name] = per_country_mortality.shift(shift_by)
In [29]:
# Zero out the NaNs left at the start of each country's series where no lag
# history exists yet.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [30]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the
# lagged-mortality columns — the components are partly derived from the target
# (likely leakage). PCA is also fit on unscaled columns and before the split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[30]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [31]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): each retained component mixes ALL columns fed to pca.fit
# (including the target and its lags), not one of the 6 original inputs.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [32]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): column names are misleading — each column is a principal
# component (a linear combination of all PCA input columns), not the original
# feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [33]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns appear unused afterwards (X comes
# from principal_df, y from 'Mortality Rate'); this also drops 'location'.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [34]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first six principal components (named after, but not equal to,
# the original features); y is the raw per-row mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffling mixes future and past days across train/test —
# optimistic for a time series; consider shuffle=False or a date-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [35]:
# Fit scaling on the training set
# Mean/std are learned from training rows only; the bare fit() call displays
# the estimator as the cell output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[35]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [36]:
# Apply scaling on the training set
# (standardizes with the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [37]:
# Apply scaling on the test set
# (same train-fitted statistics; the scaler is deliberately NOT refit on test data)
X_test_scaled = scaler.transform(X_test)
In [38]:
# Base XGBoost regressor; hyperparameters are chosen by the grid search below.
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid: tree depth, learning rate, boosting rounds,
# minimum split loss, and row/column subsampling rates.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [39]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelizes over all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9858107613099282
In [40]:
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refit on the entire training set — the extra
# best_model.fit(X_train_scaled, y_train) call in the original was redundant
# and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test features
y_pred = best_model.predict(X_test_scaled)
In [41]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between
# normalized distributions — applying it to raw mortality values (with zeros)
# is of doubtful statistical meaning.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  5.255425341034955
R2 Score: 0.8635347244167126
RMSE: 2.292471
Entropy Value: 0.012939782263897097
In [42]:
# NOTE(review): these "features" are PCA components relabeled with original
# feature names; the ranking does NOT measure the original variables'
# importance — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[42]:
feature importance
5 population 0.357251
1 human_development_index 0.314325
0 hospital_beds_per_thousand 0.158735
2 extreme_poverty 0.102932
4 population_density 0.044483
3 gdp_per_capita 0.022273
In [239]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[239]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [240]:
country1 = 'Bulgaria'
country2 = 'Czechia'

# Extracting important features for XGBoost Model Analysis for the population health index
# .copy() makes the filtered subset an independent frame so the lagged-column
# assignments in later cells cannot trigger pandas' SettingWithCopyWarning
# (chained-assignment hazard when writing into a view).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [241]:
df_updated
Out[241]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 30.5 38.3 79.38 19.027 43.3 0.919575

2061 rows × 9 columns

In [242]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross
# the country boundary.
# NOTE(review): shift() assumes rows are already date-sorted within each
# location — true for this CSV as displayed, but worth asserting explicitly.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [243]:
# Replace the NaN values produced by lagging: each country's first 1/7/30 rows
# have no history, so missing history is treated as a 0 mortality rate.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [244]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself plus the three
# lagged-mortality columns, so the prediction target leaks into the components
# later used as model inputs — likely inflating the near-perfect R^2 downstream.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns
# dominate the components; standardising before PCA is the usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[244]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [245]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Slicing keeps the first 6 components (PCA orders them by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [246]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of ALL inputs), not the original feature it is named
# after; downstream "feature importances" therefore rank components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [247]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df, y from 'Mortality Rate'), so this step is
# effectively dead code — drop it or feed the dummies into the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [248]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs are the PCA components; target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a time series, placing adjacent days of
# the same country in both train and test; a date-based holdout or
# TimeSeriesSplit would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [249]:
# Fit the standardiser on the training split only; its learned mean/std are
# reused for the test split so no test information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[249]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [250]:
# Apply scaling on the training set (z-score using the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [251]:
# Apply scaling on the test set with the SAME train-fitted statistics
X_test_scaled = scaler.transform(X_test)
In [252]:
# Define the (untuned) XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: tree depth, learning rate (shrinkage), ensemble size,
# and the regularisation / subsampling knobs
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [253]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 candidates x 10 folds = 3,240 fits; n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9590037445837588
In [254]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the whole
# training set (refit=True by default), so this fit() is redundant but harmless
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [255]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its inputs and computes a KL
# divergence; applied to raw mortality values it is not a standard regression
# metric and is undefined for non-positive predictions — interpret with care
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002678735178965485
R2 Score: 0.9983661769962296
RMSE: 0.051756
Entropy Value: 0.00028650102618085413
In [256]:
# Rank the model inputs by XGBoost's importance scores (note: the inputs are
# principal components that merely carry the original feature names)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[256]:
feature importance
0 cardiovasc_death_rate 0.480463
5 median_age 0.405327
2 male_smokers 0.053942
1 female_smokers 0.024489
4 aged_65_older 0.021659
3 life_expectancy 0.014120
In [257]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# Path/DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[257]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [258]:
country1 = 'Bulgaria'
country2 = 'Czechia'

# Extracting important features for XGBoost Model Analysis for the country health index
# .copy() makes the filtered subset an independent frame so the lagged-column
# assignments in later cells cannot trigger pandas' SettingWithCopyWarning
# (chained-assignment hazard when writing into a view).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [259]:
df_updated
Out[259]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.630 0.900 0.0 32605.906 137.176 10493990 0.919575

2061 rows × 9 columns

In [260]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross
# the country boundary.
# NOTE(review): shift() assumes rows are already date-sorted within each
# location — true for this CSV as displayed, but worth asserting explicitly.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [261]:
# Replace the NaN values produced by lagging: each country's first 1/7/30 rows
# have no history, so missing history is treated as a 0 mortality rate.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [262]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself plus the three
# lagged-mortality columns, so the prediction target leaks into the components
# later used as model inputs — likely inflating the near-perfect R^2 downstream.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns
# (e.g. population) dominate the components; standardise before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[262]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [263]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Slicing keeps the first 6 components (PCA orders them by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [264]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of ALL inputs), not the original feature it is named
# after; downstream "feature importances" therefore rank components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [265]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df, y from 'Mortality Rate'), so this step is
# effectively dead code — drop it or feed the dummies into the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [266]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs are the PCA components; target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a time series, placing adjacent days of
# the same country in both train and test; a date-based holdout or
# TimeSeriesSplit would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [267]:
# Fit the standardiser on the training split only; its learned mean/std are
# reused for the test split so no test information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[267]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [268]:
# Apply scaling on the training set (z-score using the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [269]:
# Apply scaling on the test set with the SAME train-fitted statistics
X_test_scaled = scaler.transform(X_test)
In [270]:
# Define the (untuned) XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: tree depth, learning rate (shrinkage), ensemble size,
# and the regularisation / subsampling knobs
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [271]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 candidates x 10 folds = 3,240 fits; n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9555145568773449
In [272]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the whole
# training set (refit=True by default), so this fit() is redundant but harmless
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [273]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its inputs and computes a KL
# divergence; applied to raw mortality values it is not a standard regression
# metric and is undefined for non-positive predictions — interpret with care
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002769256603078144
R2 Score: 0.9983109658703929
RMSE: 0.052624
Entropy Value: 0.00028051806238002606
In [274]:
# Rank the model inputs by XGBoost's importance scores (note: the inputs are
# principal components that merely carry the original feature names)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[274]:
feature importance
0 hospital_beds_per_thousand 0.823913
5 population 0.128464
2 extreme_poverty 0.019683
1 human_development_index 0.016833
4 population_density 0.005945
3 gdp_per_capita 0.005162
In [280]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# Path/DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[280]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [281]:
country1 = 'Denmark'
country2 = 'Finland'

# Extracting important features for XGBoost Model Analysis for the population health index
# .copy() makes the filtered subset an independent frame so the lagged-column
# assignments in later cells cannot trigger pandas' SettingWithCopyWarning
# (chained-assignment hazard when writing into a view).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [282]:
df_updated
Out[282]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
5188 Denmark 2/3/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
5189 Denmark 2/4/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
5190 Denmark 2/5/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
5191 Denmark 2/6/2020 114.767 19.3 18.8 80.90 19.677 42.3 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159
8372 Finland 12/26/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159
8373 Finland 12/27/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159
8374 Finland 12/28/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159
8375 Finland 12/29/2022 153.507 18.3 22.6 81.91 21.228 42.8 0.55159

2128 rows × 9 columns

In [283]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross
# the country boundary.
# NOTE(review): shift() assumes rows are already date-sorted within each
# location — true for this CSV as displayed, but worth asserting explicitly.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [284]:
# Replace the NaN values produced by lagging: each country's first 1/7/30 rows
# have no history, so missing history is treated as a 0 mortality rate.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [285]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself plus the three
# lagged-mortality columns, so the prediction target leaks into the components
# later used as model inputs — likely inflating the near-perfect R^2 downstream.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns
# dominate the components; standardising before PCA is the usual practice.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[285]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [286]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Slicing keeps the first 6 components (PCA orders them by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [287]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of ALL inputs), not the original feature it is named
# after; downstream "feature importances" therefore rank components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [288]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df, y from 'Mortality Rate'), so this step is
# effectively dead code — drop it or feed the dummies into the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [289]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Model inputs are the PCA components; target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a time series, placing adjacent days of
# the same country in both train and test; a date-based holdout or
# TimeSeriesSplit would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [290]:
# Fit the standardiser on the training split only; its learned mean/std are
# reused for the test split so no test information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[290]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [291]:
# Apply scaling on the training set (z-score using the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [292]:
# Apply scaling on the test set with the SAME train-fitted statistics
X_test_scaled = scaler.transform(X_test)
In [293]:
# Define the (untuned) XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: tree depth, learning rate (shrinkage), ensemble size,
# and the regularisation / subsampling knobs
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [294]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 candidates x 10 folds = 3,240 fits; n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987760862696191
In [295]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the whole
# training set (refit=True by default), so this fit() is redundant but harmless
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [296]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalises its inputs and computes a KL
# divergence; applied to raw mortality values it is not a standard regression
# metric and is undefined for non-positive predictions — interpret with care
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008754055895476695
R2 Score: 0.9950339966379534
RMSE: 0.093563
Entropy Value: 0.0017693996942911307
In [297]:
# Rank the model inputs by XGBoost's importance scores (note: the inputs are
# principal components that merely carry the original feature names)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[297]:
feature importance
1 female_smokers 0.821181
0 cardiovasc_death_rate 0.071899
2 male_smokers 0.051741
5 median_age 0.027919
3 life_expectancy 0.025571
4 aged_65_older 0.001689
In [298]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# Path/DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[298]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [299]:
country1 = 'Denmark'
country2 = 'Finland'

# Extracting important features for XGBoost Model Analysis for the country health index
# .copy() makes the filtered subset an independent frame so the lagged-column
# assignments in later cells cannot trigger pandas' SettingWithCopyWarning
# (chained-assignment hazard when writing into a view).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [300]:
df_updated
Out[300]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5188 Denmark 2/3/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5189 Denmark 2/4/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5190 Denmark 2/5/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
5191 Denmark 2/6/2020 2.50 0.940 0.20 46682.515 136.520 5882259 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2128 rows × 9 columns

In [301]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each country's series separate so lags never cross
# the country boundary.
# NOTE(review): shift() assumes rows are already date-sorted within each
# location — true for this CSV as displayed, but worth asserting explicitly.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [302]:
# Replace the NaN values produced by lagging: each country's first 1/7/30 rows
# have no history, so missing history is treated as a 0 mortality rate.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [303]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' itself plus the three
# lagged-mortality columns, so the prediction target leaks into the components
# later used as model inputs — likely inflating the near-perfect R^2 downstream.
# NOTE(review): PCA is fit on unscaled data, so large-magnitude columns
# (e.g. population) dominate the components; standardise before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[303]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [304]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Slicing keeps the first 6 components (PCA orders them by explained variance)
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [305]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of ALL inputs), not the original feature it is named
# after; downstream "feature importances" therefore rank components.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [306]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df, y from 'Mortality Rate'), so this step is
# effectively dead code — drop it or feed the dummies into the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [307]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# Model inputs are the PCA components; target is the raw mortality rate
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a time series, placing adjacent days of
# the same country in both train and test; a date-based holdout or
# TimeSeriesSplit would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [308]:
# Fit the standardiser on the training split only; its learned mean/std are
# reused for the test split so no test information leaks into the scaling.
scaler = StandardScaler().fit(X_train)
scaler
Out[308]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [309]:
# Apply scaling on the training set (z-score using the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [310]:
# Apply scaling on the test set with the SAME train-fitted statistics
X_test_scaled = scaler.transform(X_test)
In [311]:
# Define the (untuned) XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: tree depth, learning rate (shrinkage), ensemble size,
# and the regularisation / subsampling knobs
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [312]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 candidates x 10 folds = 3,240 fits; n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984953654596765
In [313]:
# GridSearchCV refits the winning hyperparameter combination on the whole training
# set by default (refit=True), so best_estimator_ is already trained — the extra
# fit() call previously here was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [314]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1 and
# returns the KL divergence between them — it is not a standard regression metric,
# and it is only finite when y_pred has no exact zeros. Interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008136863131264145
R2 Score: 0.9953841179278682
RMSE: 0.090205
Entropy Value: 0.0015398143907589235
In [315]:
# Tabulate the model's gain-based importances, sorted descending.
# NOTE(review): X was built from PCA output, so these are importances of principal
# components — the original-feature labels attached here are misleading.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[315]:
feature importance
1 human_development_index 0.753936
5 population 0.092143
0 hospital_beds_per_thousand 0.070220
2 extreme_poverty 0.061815
3 gdp_per_capita 0.020882
4 population_density 0.001005
In [316]:
# Importing the dataframe of all 26 countries
# TODO: replace the hardcoded absolute Windows path with a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[316]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [317]:
country1 = 'Switzerland'
country2 = 'Canada'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() gives an independent frame, so the lagged-mortality columns assigned later
# cannot trigger pandas' SettingWithCopyWarning or silently write to a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [318]:
# Inspect the filtered two-country subset
df_updated
Out[318]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 12.0 16.6 82.43 16.984 41.4 1.093162

2111 rows × 9 columns

In [319]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are date-sorted within each location — the source
# CSV appears ordered that way, but an explicit sort_values(['location', 'date']) with
# parsed dates would make this robust.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [320]:
# The shifts above leave NaNs at the start of each country's series; treat those
# early observations as zero mortality. One vectorized fillna covers all three columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [321]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' AND the three lagged-mortality
# columns, so the target leaks into the fitted components — a likely contributor to
# the near-perfect downstream R^2. TODO: fit PCA on predictor columns only, and
# standardize before PCA (unscaled inputs let large-magnitude columns dominate).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[321]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [322]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project the data and keep only the six highest-variance components
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [323]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components PC1..PC6, NOT the original
# features — reusing the original feature names here is misleading, and the later
# "feature importance" table inherits the wrong labels. Prefer names like 'PC1'..'PC6'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# .values strips the original index so the column aligns positionally with the PCA rows
principal_df['location'] = df_updated['location'].values
In [324]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards — X is built from
# principal_df below — so this call only keeps df_updated fully numeric.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [325]:
# Principal components (carrying the original feature names as labels) used as model inputs
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split on time-series rows lets the model train on future
# observations; a chronological split would give a more honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [326]:
# Fit scaling on the training set
# (mean/std are learned from the training split only, avoiding test-set leakage here)
scaler = StandardScaler()
scaler.fit(X_train)
Out[326]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [327]:
# Apply scaling on the training set using the train-fitted mean/std
X_train_scaled = scaler.transform(X_train)
In [328]:
# Apply scaling on the test set — reuse the train-fitted parameters so the test
# set never influences the scaling
X_test_scaled = scaler.transform(X_test)
In [329]:
# Define the XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space explored by GridSearchCV
# (3 * 3 * 3 * 3 * 2 * 2 = 324 candidate combinations)
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [330]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): shuffled k-fold CV on time-series rows is optimistic for the same
# reason as the random train/test split — consider TimeSeriesSplit for honest scores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the mean CV score of the winning candidate
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.999066309456236
In [331]:
# GridSearchCV refits the winning hyperparameter combination on the whole training
# set by default (refit=True), so best_estimator_ is already trained — the extra
# fit() call previously here was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [332]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1 and
# returns the KL divergence between them — it is not a standard regression metric,
# and it is only finite when y_pred has no exact zeros. Interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010540142156221432
R2 Score: 0.9967890951053929
RMSE: 0.102665
Entropy Value: 0.0015411517323291324
In [333]:
# Tabulate the model's gain-based importances, sorted descending.
# NOTE(review): X was built from PCA output, so these are importances of principal
# components — the original-feature labels attached here are misleading.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[333]:
feature importance
1 female_smokers 0.724506
0 cardiovasc_death_rate 0.158352
5 median_age 0.050841
3 life_expectancy 0.034126
2 male_smokers 0.029129
4 aged_65_older 0.003046
In [334]:
# Importing the dataframe of all 26 countries
# TODO: replace the hardcoded absolute Windows path with a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[334]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [335]:
country1 = 'Switzerland'
country2 = 'Canada'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() gives an independent frame, so the lagged-mortality columns assigned later
# cannot trigger pandas' SettingWithCopyWarning or silently write to a view.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [336]:
df_updated
Out[336]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.50 0.929 0.50 44017.591 4.037 38454328 1.093162

2111 rows × 9 columns

In [337]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are date-sorted within each location — the source
# CSV appears ordered that way, but an explicit sort_values(['location', 'date']) with
# parsed dates would make this robust.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [338]:
# The shifts above leave NaNs at the start of each country's series; treat those
# early observations as zero mortality. One vectorized fillna covers all three columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [339]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' AND the three lagged-mortality
# columns, so the target leaks into the fitted components — a likely contributor to
# the near-perfect downstream R^2. TODO: fit PCA on predictor columns only, and
# standardize before PCA (unscaled inputs let large-magnitude columns dominate).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[339]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [340]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [341]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [342]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [343]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [344]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[344]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [345]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [346]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [347]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [348]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989799879827812
In [349]:
# GridSearchCV refits the winning hyperparameter combination on the whole training
# set by default (refit=True), so best_estimator_ is already trained — the extra
# fit() call previously here was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [350]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both vectors to sum to 1 and
# returns the KL divergence between them — it is not a standard regression metric,
# and it is only finite when y_pred has no exact zeros. Interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008411506866870231
R2 Score: 0.9974375536715211
RMSE: 0.091714
Entropy Value: 0.001265725840506202
In [351]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[351]:
feature importance
1 human_development_index 0.526054
5 population 0.306753
0 hospital_beds_per_thousand 0.071260
3 gdp_per_capita 0.048732
2 extreme_poverty 0.043372
4 population_density 0.003829
In [352]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[352]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [353]:
country1 = 'Cyprus'
country2 = 'Portugal'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() gives an independent frame, so the lagged-mortality columns assigned later
# cannot trigger pandas' SettingWithCopyWarning or silently write to a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [354]:
df_updated
Out[354]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977
11514 Portugal 12/26/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977
11515 Portugal 12/27/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977
11516 Portugal 12/28/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977
11517 Portugal 12/29/2022 127.842 16.3 30.0 82.05 21.502 46.2 0.462977

2061 rows × 9 columns

In [355]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [356]:
# The shifts above leave NaNs at the start of each country's series; treat those
# early observations as zero mortality. One vectorized fillna covers all three columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [357]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' AND the three lagged-mortality
# columns, so the target leaks into the fitted components — a likely contributor to
# the near-perfect downstream R^2. TODO: fit PCA on predictor columns only, and
# standardize before PCA (unscaled inputs let large-magnitude columns dominate).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[357]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [358]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [359]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [360]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [361]:
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [362]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[362]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [363]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [364]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [365]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [366]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.997717565913917
In [367]:
# GridSearchCV refits the winning hyperparameter combination on the whole training
# set by default (refit=True), so best_estimator_ is already trained — the extra
# fit() call previously here was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [368]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0011053104561160806
R2 Score: 0.9988694272398144
RMSE: 0.033246
Entropy Value: 0.0004480598968395122
In [369]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[369]:
feature importance
1 female_smokers 0.437605
0 cardiovasc_death_rate 0.295545
5 median_age 0.225797
2 male_smokers 0.028535
3 life_expectancy 0.008015
4 aged_65_older 0.004504
In [370]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[370]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [371]:
country1 = 'Cyprus'
country2 = 'Portugal'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() gives an independent frame, so the lagged-mortality columns assigned later
# cannot trigger pandas' SettingWithCopyWarning or silently write to a view.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [372]:
df_updated
Out[372]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977
11514 Portugal 12/26/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977
11515 Portugal 12/27/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977
11516 Portugal 12/28/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977
11517 Portugal 12/29/2022 3.39 0.864 0.50 27936.896 112.371 10270857 0.462977

2061 rows × 9 columns

In [373]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three per-country lagged mortality features in one pass; shifting inside
# each 'location' group keeps one country's history out of another country's rows.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [374]:
# The earliest rows of each country's series predate the lag window and so have no
# prior observation; fill those NaNs with 0, matching the pre-outbreak baseline.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [375]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] is every column after 'location'/'date', which at this point
# includes 'Mortality Rate' itself and the three lagged-mortality columns. Fitting PCA on
# the prediction target leaks it into the model inputs and inflates the scores reported
# below; PCA should be fit on predictor columns only. Also, PCA is scale-sensitive and the
# data is not standardized before this fit (scaling happens only after the projection) —
# confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[375]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [376]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): PCA() above computed all components; slicing to the first 6 works, but
# PCA(n_components=6) would be clearer and cheaper.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [377]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components — linear mixtures of ALL the
# PCA inputs (including the mortality lags) — not the original variables. Re-using the raw
# feature names here makes the downstream feature-importance table read as a ranking of
# the real features, which the pipeline as coded does not support.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [378]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never used afterwards —
# X below is built from principal_df, and only 'Mortality Rate' is read from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [379]:
# Assemble the design matrix from the six retained principal components; the target is
# taken from the (row-aligned) one-hot frame.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [380]:
# Fit scaling on the training set
# StandardScaler statistics (mean/std) come from the training split only, so the test
# set does not influence the scaling — no leakage at this step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[380]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [381]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [382]:
# Apply scaling on the test set with the train-fitted statistics (correct usage)
X_test_scaled = scaler.transform(X_test)
In [383]:
# Define XGBoost model
# XGBRegressor with defaults; all tuning happens through the grid below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 combinations; with cv=10 below that is 3,240 fits per
# pipeline run — RandomizedSearchCV would cut runtime with little loss.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [384]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9972746167510301
In [385]:
# Use the tuned model: GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so the previous explicit
# best_model.fit(X_train_scaled, y_train) was redundant work and is dropped.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test split
y_pred = best_model.predict(X_test_scaled)
In [386]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two probability
# distributions (it renormalizes its inputs). y_test/y_pred are mortality rates, not
# distributions, and y_test contains exact zeros, so this value is not a meaningful
# regression metric here — confirm what it is intended to measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0011288091118561118
R2 Score: 0.9988453915130793
RMSE: 0.033598
Entropy Value: 0.00046678790937134003
In [387]:
# NOTE(review): X was built from principal components that were relabelled with raw
# feature names, so this table ranks PCs, not the original variables; the per-feature
# interpretation is not supported by the pipeline as coded. The near-perfect scores above
# also reflect the mortality target/lags entering the PCA inputs.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[387]:
feature importance
1 human_development_index 0.410411
5 population 0.387691
0 hospital_beds_per_thousand 0.168443
2 extreme_poverty 0.027509
3 gdp_per_capita 0.003726
4 population_density 0.002220
In [388]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR or a
# path relative to the notebook so it runs on other machines. Re-reading the full CSV
# here also silently replaces the two-country df_updated used above (name reuse).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[388]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [389]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() detaches the two-country subset from the full frame, so the lagged-mortality
# columns added in later cells assign to an independent DataFrame instead of a view
# (avoids pandas SettingWithCopyWarning / potentially lost writes).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [390]:
df_updated
Out[390]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 22.9 37.1 76.05 17.850 43.0 2.036403

2076 rows × 9 columns

In [391]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three per-country lagged mortality features in one pass; shifting inside
# each 'location' group keeps one country's history out of another country's rows.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [392]:
# The earliest rows of each country's series predate the lag window and so have no
# prior observation; fill those NaNs with 0, matching the pre-outbreak baseline.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [393]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] is every column after 'location'/'date', which at this point
# includes 'Mortality Rate' itself and the three lagged-mortality columns. Fitting PCA on
# the prediction target leaks it into the model inputs and inflates the scores reported
# below; PCA should be fit on predictor columns only. Also, the data is not standardized
# before this scale-sensitive fit — confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[393]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [394]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): PCA() above computed all components; PCA(n_components=6) would be
# clearer and cheaper than slicing.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [395]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components — linear mixtures of ALL the
# PCA inputs (including the mortality lags) — not the original variables; re-using the
# raw feature names misrepresents the downstream feature-importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [396]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never used afterwards —
# X below is built from principal_df, and only 'Mortality Rate' is read from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [397]:
# Assemble the design matrix from the six retained principal components; the target is
# taken from the (row-aligned) one-hot frame.
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [398]:
# Fit scaling on the training set
# StandardScaler statistics (mean/std) come from the training split only, so the test
# set does not influence the scaling — no leakage at this step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[398]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [399]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [400]:
# Apply scaling on the test set with the train-fitted statistics (correct usage)
X_test_scaled = scaler.transform(X_test)
In [401]:
# Define XGBoost model
# XGBRegressor with defaults; all tuning happens through the grid below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 combinations; with cv=10 below that is 3,240 fits per
# pipeline run — RandomizedSearchCV would cut runtime with little loss.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [402]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.998541039563188
In [403]:
# Use the tuned model: GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so the previous explicit
# best_model.fit(X_train_scaled, y_train) was redundant work and is dropped.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test split
y_pred = best_model.predict(X_test_scaled)
In [404]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two probability
# distributions (it renormalizes its inputs). y_test/y_pred are mortality rates, not
# distributions, and y_test contains exact zeros, so this value is not a meaningful
# regression metric here — confirm what it is intended to measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001587330310482497
R2 Score: 0.9990758429374236
RMSE: 0.039841
Entropy Value: 0.0004022874798044647
In [405]:
# NOTE(review): X was built from principal components that were relabelled with raw
# feature names, so this table ranks PCs, not the original variables; the per-feature
# interpretation is not supported by the pipeline as coded. The near-perfect scores above
# also reflect the mortality target/lags entering the PCA inputs.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[405]:
feature importance
0 cardiovasc_death_rate 0.538776
5 median_age 0.265115
1 female_smokers 0.170741
2 male_smokers 0.017414
3 life_expectancy 0.007693
4 aged_65_older 0.000261
In [406]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR or a
# path relative to the notebook so it runs on other machines. Re-reading the full CSV
# here also silently replaces the two-country df_updated used above (name reuse).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[406]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [407]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() detaches the two-country subset from the full frame, so the lagged-mortality
# columns added in later cells assign to an independent DataFrame instead of a view
# (avoids pandas SettingWithCopyWarning / potentially lost writes).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [408]:
df_updated
Out[408]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 9 columns

In [409]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three per-country lagged mortality features in one pass; shifting inside
# each 'location' group keeps one country's history out of another country's rows.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [410]:
# The earliest rows of each country's series predate the lag window and so have no
# prior observation; fill those NaNs with 0, matching the pre-outbreak baseline.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [411]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] is every column after 'location'/'date', which at this point
# includes 'Mortality Rate' itself and the three lagged-mortality columns. Fitting PCA on
# the prediction target leaks it into the model inputs and inflates the scores reported
# below; PCA should be fit on predictor columns only. Also, the data is not standardized
# before this scale-sensitive fit — confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[411]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [412]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): PCA() above computed all components; PCA(n_components=6) would be
# clearer and cheaper than slicing.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [413]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components — linear mixtures of ALL the
# PCA inputs (including the mortality lags) — not the original variables; re-using the
# raw feature names misrepresents the downstream feature-importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [414]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never used afterwards —
# X below is built from principal_df, and only 'Mortality Rate' is read from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [415]:
# Assemble the design matrix from the six retained principal components; the target is
# taken from the (row-aligned) one-hot frame.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [416]:
# Fit scaling on the training set
# StandardScaler statistics (mean/std) come from the training split only, so the test
# set does not influence the scaling — no leakage at this step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[416]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [417]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [418]:
# Apply scaling on the test set with the train-fitted statistics (correct usage)
X_test_scaled = scaler.transform(X_test)
In [419]:
# Define XGBoost model
# XGBRegressor with defaults; all tuning happens through the grid below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 combinations; with cv=10 below that is 3,240 fits per
# pipeline run — RandomizedSearchCV would cut runtime with little loss.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [420]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979474526774726
In [421]:
# Use the tuned model: GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so the previous explicit
# best_model.fit(X_train_scaled, y_train) was redundant work and is dropped.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test split
y_pred = best_model.predict(X_test_scaled)
In [422]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two probability
# distributions (it renormalizes its inputs). y_test/y_pred are mortality rates, not
# distributions, and y_test contains exact zeros, so this value is not a meaningful
# regression metric here — confirm what it is intended to measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002076185405706244
R2 Score: 0.9987912273877525
RMSE: 0.045565
Entropy Value: 0.00045853981129780964
In [423]:
# NOTE(review): X was built from principal components that were relabelled with raw
# feature names, so this table ranks PCs, not the original variables; the per-feature
# interpretation is not supported by the pipeline as coded. The near-perfect scores above
# also reflect the mortality target/lags entering the PCA inputs.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[423]:
feature importance
5 population 0.663901
0 hospital_beds_per_thousand 0.186232
1 human_development_index 0.124847
2 extreme_poverty 0.013139
3 gdp_per_capita 0.011260
4 population_density 0.000621
In [424]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR or a
# path relative to the notebook so it runs on other machines. Re-reading the full CSV
# here also silently replaces the two-country df_updated used above (name reuse).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[424]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [425]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() detaches the two-country subset from the full frame, so the lagged-mortality
# columns added in later cells assign to an independent DataFrame instead of a view
# (avoids pandas SettingWithCopyWarning / potentially lost writes).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [426]:
df_updated
Out[426]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 20.1 25.0 81.32 19.062 44.5 0.536669

2091 rows × 9 columns

In [427]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three per-country lagged mortality features in one pass; shifting inside
# each 'location' group keeps one country's history out of another country's rows.
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [428]:
# The earliest rows of each country's series predate the lag window and so have no
# prior observation; fill those NaNs with 0, matching the pre-outbreak baseline.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [429]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] is every column after 'location'/'date', which at this point
# includes 'Mortality Rate' itself and the three lagged-mortality columns. Fitting PCA on
# the prediction target leaks it into the model inputs and inflates the scores reported
# below; PCA should be fit on predictor columns only. Also, the data is not standardized
# before this scale-sensitive fit — confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[429]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [430]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): PCA() above computed all components; PCA(n_components=6) would be
# clearer and cheaper than slicing.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [431]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these six columns are principal components — linear mixtures of ALL the
# PCA inputs (including the mortality lags) — not the original variables; re-using the
# raw feature names misrepresents the downstream feature-importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [432]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns created here are never used afterwards —
# X below is built from principal_df, and only 'Mortality Rate' is read from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [433]:
# Assemble the design matrix from the six retained principal components; the target is
# taken from the (row-aligned) one-hot frame.
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [434]:
# Fit scaling on the training set
# StandardScaler statistics (mean/std) come from the training split only, so the test
# set does not influence the scaling — no leakage at this step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[434]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [435]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [436]:
# Apply scaling on the test set with the train-fitted statistics (correct usage)
X_test_scaled = scaler.transform(X_test)
In [437]:
# Define XGBoost model
# XGBRegressor with defaults; all tuning happens through the grid below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 combinations; with cv=10 below that is 3,240 fits per
# pipeline run — RandomizedSearchCV would cut runtime with little loss.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [438]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9982988581909075
In [439]:
# Use the tuned model: GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so the previous explicit
# best_model.fit(X_train_scaled, y_train) was redundant work and is dropped.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test split
y_pred = best_model.predict(X_test_scaled)
In [440]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two probability
# distributions (it renormalizes its inputs). y_test/y_pred are mortality rates, not
# distributions, and y_test contains exact zeros, so this value is not a meaningful
# regression metric here — confirm what it is intended to measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004058159026547636
R2 Score: 0.9980089836798032
RMSE: 0.063704
Entropy Value: 0.0005571201418958172
In [441]:
# NOTE(review): X was built from principal components that were relabelled with raw
# feature names, so this table ranks PCs, not the original variables; the per-feature
# interpretation is not supported by the pipeline as coded. The near-perfect scores above
# also reflect the mortality target/lags entering the PCA inputs.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[441]:
feature importance
1 female_smokers 0.740876
0 cardiovasc_death_rate 0.172804
5 median_age 0.065567
2 male_smokers 0.015433
4 aged_65_older 0.004372
3 life_expectancy 0.000948
In [442]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR or a
# path relative to the notebook so it runs on other machines. Re-reading the full CSV
# here also silently replaces the two-country df_updated used above (name reuse).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[442]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [443]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the two-country subset an independent frame, so the lag columns
# assigned in later cells do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [444]:
df_updated
Out[444]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2091 rows × 9 columns

In [445]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): these lag columns are never used as model inputs directly; they
# only enter via the later PCA fit on df_updated.iloc[:, 2:], which also includes
# the 'Mortality Rate' target itself — that leaks target information into X.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [446]:
# Replace the NaN values produced by the lag shifts (the first day/week/month of
# each country's series) with 0, in one vectorized multi-column assignment.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [447]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the fit uses every column from position 2 onward, which includes
# 'Mortality Rate' (the target) and its lag columns — the components therefore
# encode target information (leakage). PCA is also applied to unscaled data, so
# the largest-variance columns (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[447]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [448]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the 6 retained components are linear combinations of ALL columns
# passed to pca.fit (including the target and its lags); they do not correspond
# one-to-one with the 6 original input variables.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [449]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a mix of every PCA input column), not the
# named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [450]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are not referenced again
# in this section — only 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [451]:
# Model inputs: the 6 principal components (labeled with original feature names).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so this random split mixes
# past and future observations of a time series; a chronological split would give
# a more honest estimate of forecasting performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [452]:
# Fit scaling on the training set
# (fitting on the training portion only keeps test-set statistics out of the
# standardization)
scaler = StandardScaler()
scaler.fit(X_train)
Out[452]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [453]:
# Apply scaling on the training set, using the mean/std learned in the previous cell
X_train_scaled = scaler.transform(X_train)
In [454]:
# Apply scaling on the test set, reusing the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [455]:
# Define the XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space explored by the grid search in the next cell
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [456]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the rows were already shuffled by train_test_split, so the CV
# folds mix past and future observations of the time series — CV scores are
# optimistic. n_jobs=-1 uses all CPU cores. GridSearchCV refits the best
# estimator on the full training set by default (refit=True).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979685572320796
In [457]:
# Use the tuned model. GridSearchCV was created with the default refit=True, so
# best_estimator_ has already been refit on the full training set — the explicit
# second fit() call that was here was redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [458]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs and computes the
# KL divergence between them — it is a distribution-similarity measure, not a
# standard regression error metric; interpret "Entropy Value" accordingly.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.010286891805743988
R2 Score: 0.994953039214247
RMSE: 0.101424
Entropy Value: 0.001005217028297138
In [459]:
# Rank the model inputs by XGBoost's learned feature importance.
# NOTE(review): the model was trained on principal components, but `selected_cols`
# relabels each component with an original feature name — each importance value
# really belongs to a PC (a mix of all PCA inputs), so this per-feature reading
# is misleading.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[459]:
feature importance
1 human_development_index 0.658730
5 population 0.288222
2 extreme_poverty 0.024648
0 hospital_beds_per_thousand 0.015281
3 gdp_per_capita 0.007035
4 population_density 0.006085
In [460]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local Windows path — prefer a relative or
# configurable path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[460]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [461]:
country1 = 'Spain'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the two-country subset an independent frame, so the lag columns
# assigned in later cells do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [462]:
df_updated
Out[462]:
location date cardiovasc_death_rate female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
24074 Spain 2/1/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
24075 Spain 2/2/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
24076 Spain 2/3/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
24077 Spain 2/4/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
24078 Spain 2/5/2020 99.403 27.4 31.4 83.56 19.436 45.5 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 9 columns

In [463]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): these lag columns are never used as model inputs directly; they
# only enter via the later PCA fit on df_updated.iloc[:, 2:], which also includes
# the 'Mortality Rate' target itself — that leaks target information into X.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [464]:
# Replace the NaN values produced by the lag shifts (the first day/week/month of
# each country's series) with 0, in one vectorized multi-column assignment.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [465]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the fit uses every column from position 2 onward, which includes
# 'Mortality Rate' (the target) and its lag columns — the components therefore
# encode target information (leakage). PCA is also applied to unscaled data, so
# the largest-variance columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[465]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [466]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the 6 retained components are linear combinations of ALL columns
# passed to pca.fit (including the target and its lags); they do not correspond
# one-to-one with the 6 original input variables.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [467]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a mix of every PCA input column), not the
# named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [468]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are not referenced again
# in this section — only 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [469]:
# Model inputs: the 6 principal components (labeled with original feature names).
selected_cols = ['cardiovasc_death_rate', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so this random split mixes
# past and future observations of a time series; a chronological split would give
# a more honest estimate of forecasting performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [470]:
# Fit scaling on the training set
# (fitting on the training portion only keeps test-set statistics out of the
# standardization)
scaler = StandardScaler()
scaler.fit(X_train)
Out[470]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [471]:
# Apply scaling on the training set, using the mean/std learned in the previous cell
X_train_scaled = scaler.transform(X_train)
In [472]:
# Apply scaling on the test set, reusing the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [473]:
# Define the XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space explored by the grid search in the next cell
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [474]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the rows were already shuffled by train_test_split, so the CV
# folds mix past and future observations of the time series — CV scores are
# optimistic. n_jobs=-1 uses all CPU cores. GridSearchCV refits the best
# estimator on the full training set by default (refit=True).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9980806947075394
In [475]:
# Use the tuned model. GridSearchCV was created with the default refit=True, so
# best_estimator_ has already been refit on the full training set — the explicit
# second fit() call that was here was redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [476]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs and computes the
# KL divergence between them — it is a distribution-similarity measure, not a
# standard regression error metric; interpret "Entropy Value" accordingly.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.04804061406111574
R2 Score: 0.9919137103360092
RMSE: 0.219182
Entropy Value: 0.0016530566772748438
In [477]:
# Rank the model inputs by XGBoost's learned feature importance.
# NOTE(review): the model was trained on principal components, but `selected_cols`
# relabels each component with an original feature name — each importance value
# really belongs to a PC (a mix of all PCA inputs), so this per-feature reading
# is misleading.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[477]:
feature importance
1 female_smokers 0.728742
0 cardiovasc_death_rate 0.180431
5 median_age 0.065855
2 male_smokers 0.014653
3 life_expectancy 0.010040
4 aged_65_older 0.000279
In [478]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local Windows path — prefer a relative or
# configurable path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[478]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [479]:
country1 = 'Spain'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the two-country subset an independent frame, so the lag columns
# assigned in later cells do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [480]:
df_updated
Out[480]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
24074 Spain 2/1/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
24075 Spain 2/2/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
24076 Spain 2/3/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
24077 Spain 2/4/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
24078 Spain 2/5/2020 2.97 0.904 1.0 34272.360 93.105 47558632 0.000000
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 9 columns

In [481]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): these lag columns are never used as model inputs directly; they
# only enter via the later PCA fit on df_updated.iloc[:, 2:], which also includes
# the 'Mortality Rate' target itself — that leaks target information into X.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [482]:
# Replace the NaN values produced by the lag shifts (the first day/week/month of
# each country's series) with 0, in one vectorized multi-column assignment.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [483]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): the fit uses every column from position 2 onward, which includes
# 'Mortality Rate' (the target) and its lag columns — the components therefore
# encode target information (leakage). PCA is also applied to unscaled data, so
# the largest-variance columns (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[483]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [484]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the 6 retained components are linear combinations of ALL columns
# passed to pca.fit (including the target and its lags); they do not correspond
# one-to-one with the 6 original input variables.
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [485]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a mix of every PCA input column), not the
# named feature itself.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [486]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are not referenced again
# in this section — only 'Mortality Rate' is read from df_updated afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [487]:
# Model inputs: the 6 principal components (labeled with original feature names).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so this random split mixes
# past and future observations of a time series; a chronological split would give
# a more honest estimate of forecasting performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [488]:
# Fit scaling on the training set
# (fitting on the training portion only keeps test-set statistics out of the
# standardization)
scaler = StandardScaler()
scaler.fit(X_train)
Out[488]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [489]:
# Apply scaling on the training set, using the mean/std learned in the previous cell
X_train_scaled = scaler.transform(X_train)
In [490]:
# Apply scaling on the test set, reusing the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [491]:
# Define the XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space explored by the grid search in the next cell
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [492]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the rows were already shuffled by train_test_split, so the CV
# folds mix past and future observations of the time series — CV scores are
# optimistic. n_jobs=-1 uses all CPU cores. GridSearchCV refits the best
# estimator on the full training set by default (refit=True).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9969133378057181
In [493]:
# Use the tuned model. GridSearchCV was created with the default refit=True, so
# best_estimator_ has already been refit on the full training set — the explicit
# second fit() call that was here was redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [494]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs and computes the
# KL divergence between them — it is a distribution-similarity measure, not a
# standard regression error metric; interpret "Entropy Value" accordingly.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06838014177054151
R2 Score: 0.9884901214435369
RMSE: 0.261496
Entropy Value: 0.002529999431244677
In [495]:
# Rank the model inputs by XGBoost's learned feature importance.
# NOTE(review): the model was trained on principal components, but `selected_cols`
# relabels each component with an original feature name — each importance value
# really belongs to a PC (a mix of all PCA inputs), so this per-feature reading
# is misleading.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[495]:
feature importance
1 human_development_index 0.552532
0 hospital_beds_per_thousand 0.179562
4 population_density 0.124521
5 population 0.053568
2 extreme_poverty 0.048355
3 gdp_per_capita 0.041462
In [48]:
# Country Pair by Pair Analysis relative to median age
In [49]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hardcoded absolute local Windows path — prefer a relative or
# configurable path so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[49]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [50]:
# Showing the pairings of countries based on median age (13 pairs of countries).
# Split the master frame once with groupby instead of 26 separate boolean
# filters; each group keeps its original rows, order, and index, so every
# df_<Country> below equals df[df.location == "<Country>"].
_frames = {country: frame for country, frame in df.groupby('location')}

df_Bulgaria = _frames["Bulgaria"]
df_Italy = _frames["Italy"]

df_Portugal = _frames["Portugal"]
df_Slovenia = _frames["Slovenia"]

df_Spain = _frames["Spain"]
df_Austria = _frames["Austria"]

df_Belgium = _frames["Belgium"]
df_Canada = _frames["Canada"]

df_Czechia = _frames["Czechia"]
df_Denmark = _frames["Denmark"]

df_Estonia = _frames["Estonia"]
df_Finland = _frames["Finland"]

df_France = _frames["France"]
df_Latvia = _frames["Latvia"]

df_Netherlands = _frames["Netherlands"]
df_Romania = _frames["Romania"]

df_Serbia = _frames["Serbia"]
df_Slovakia = _frames["Slovakia"]

df_Sweden = _frames["Sweden"]
df_Switzerland = _frames["Switzerland"]

df_Cyprus = _frames["Cyprus"]
df_Iceland = _frames["Iceland"]

df_Ireland = _frames["Ireland"]
df_Luxembourg = _frames["Luxembourg"]

df_UnitedKingdom = _frames["United Kingdom"]
df_UnitedStates = _frames["United States"]
In [51]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [52]:
# Concatenate the individual country dataframes (all 26 countries defined above,
# with the trimmed UK frame) into a single dataframe.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting the combined dataframe to a CSV file.
# index=False avoids writing the row index, which would otherwise reappear as a
# spurious 'Unnamed: 0' column when the CSV is read back.
# NOTE(review): this writes to the current working directory, but later cells
# read "C:/Users/marco/Downloads/dataframe-one.csv" — make sure both paths refer
# to the same file.
dataframe_one.to_csv("dataframe-one.csv", index=False)
In [53]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute local Windows path — and it differs from the
# relative path the previous cell wrote to; prefer one configurable location.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[53]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [22]:
country1 = 'Bulgaria'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the two-country subset an independent frame, so the lag columns
# assigned in later cells do not trigger pandas' SettingWithCopyWarning.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [23]:
df_updated
Out[23]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 14.285714
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 0.735109

2091 rows × 9 columns

In [24]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous month mortality rates.
# groupby('location') keeps each country's lags from bleeding into the other country.
# NOTE(review): shift() assumes rows are date-sorted within each country — TODO
# confirm the source CSV is sorted by date per location before relying on these lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [25]:
# Replace the NaN values that shift() leaves at the start of each country's series with 0.
# NOTE(review): filling with 0 fabricates a zero-mortality history for each
# country's first 1/7/30 rows; dropping those rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [26]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[26]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [27]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [28]:
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these six columns are principal components, not the original
# features; labelling them with the feature names makes the later
# feature-importance table read as if it ranked the raw inputs — confirm that
# interpretation is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [29]:
# Convert categorical variables to numerical variables using one-hot encoding.
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this step only widens df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [30]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for the XGBoost Model.
# NOTE(review): train_test_split shuffles by default; with lagged-target
# features on a daily time series this mixes past and future observations
# between train and test — consider shuffle=False or a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [31]:
# Fit scaling on the training set only (statistics are reused for the test set).
# NOTE(review): scaling is applied to the principal components; the more usual
# order is StandardScaler before PCA — confirm this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[31]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [32]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [33]:
# Apply scaling on the test set, using training-set statistics (no test leakage here)
X_test_scaled = scaler.transform(X_test)
In [34]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [35]:
# Perform grid search with 10-fold cross-validation (k = 10).
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the corresponding mean CV score (R^2 for regressors)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9904546220941783
In [36]:
# Use the model refit with the best hyperparameters.
# GridSearchCV(refit=True, the default) has already refit best_estimator_ on
# the full training set, so the previous explicit best_model.fit(...) call was
# redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [37]:
# Evaluate the performance of the XGBoost Model: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors and returns
# the KL divergence between them, i.e. it treats mortality rates as probability
# distributions — it is not a standard regression metric; confirm this is the
# intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01619619055558157
R2 Score: 0.9984776146320379
RMSE: 0.127264
Entropy Value: 0.0003811524578491067
In [38]:
# Rank the model inputs by the importance XGBoost assigned to them.
# (The inputs are principal components labelled with the original feature names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[38]:
feature importance
5 aged_65_older 0.511189
1 diabetes_prevalence 0.460899
2 female_smokers 0.013441
0 cardiovasc_death_rate 0.008192
4 life_expectancy 0.004344
3 male_smokers 0.001936
In [39]:
# Importing the dataframe of all 26 countries (resets df_updated for the next analysis).
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[39]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [40]:
country1 = 'Bulgaria'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [41]:
df_updated
Out[41]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.180 0.892 2.0 35220.084 205.859 59037472 0.735109

2091 rows × 9 columns

In [42]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous month mortality rates.
# groupby('location') keeps each country's lags from bleeding into the other country.
# NOTE(review): shift() assumes rows are date-sorted within each country — TODO
# confirm the source CSV is sorted by date per location before relying on these lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [43]:
# Replace the NaN values that shift() leaves at the start of each country's series with 0.
# NOTE(review): filling with 0 fabricates a zero-mortality history for each
# country's first 1/7/30 rows; dropping those rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [44]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[44]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [45]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [46]:
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these six columns are principal components, not the original
# features; labelling them with the feature names makes the later
# feature-importance table read as if it ranked the raw inputs — confirm that
# interpretation is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [47]:
# Convert categorical variables to numerical variables using one-hot encoding.
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this step only widens df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [48]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for the XGBoost Model.
# NOTE(review): train_test_split shuffles by default; with lagged-target
# features on a daily time series this mixes past and future observations
# between train and test — consider shuffle=False or a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [49]:
# Fit scaling on the training set only (statistics are reused for the test set).
# NOTE(review): scaling is applied to the principal components; the more usual
# order is StandardScaler before PCA — confirm this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[49]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [50]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [51]:
# Apply scaling on the test set, using training-set statistics (no test leakage here)
X_test_scaled = scaler.transform(X_test)
In [52]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [53]:
# Perform grid search with 10-fold cross-validation (k = 10).
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the corresponding mean CV score (R^2 for regressors)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9922672063895973
In [54]:
# Use the model refit with the best hyperparameters.
# GridSearchCV(refit=True, the default) has already refit best_estimator_ on
# the full training set, so the previous explicit best_model.fit(...) call was
# redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [55]:
# Evaluate the performance of the XGBoost Model: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors and returns
# the KL divergence between them, i.e. it treats mortality rates as probability
# distributions — it is not a standard regression metric; confirm this is the
# intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.015206168226033538
R2 Score: 0.998570673274642
RMSE: 0.123313
Entropy Value: 0.0005836511271442241
In [56]:
# Rank the model inputs by the importance XGBoost assigned to them.
# (The inputs are principal components labelled with the original feature names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[56]:
feature importance
1 human_development_index 0.643511
5 population 0.194207
0 hospital_beds_per_thousand 0.129965
2 extreme_poverty 0.022751
4 population_density 0.006924
3 gdp_per_capita 0.002641
In [57]:
# Importing the dataframe of all 26 countries (resets df_updated for the next analysis).
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[57]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [58]:
country1 = 'Portugal'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [59]:
df_updated
Out[59]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 0.536669

2096 rows × 9 columns

In [60]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous month mortality rates.
# groupby('location') keeps each country's lags from bleeding into the other country.
# NOTE(review): shift() assumes rows are date-sorted within each country — TODO
# confirm the source CSV is sorted by date per location before relying on these lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [61]:
# Replace the NaN values that shift() leaves at the start of each country's series with 0.
# NOTE(review): filling with 0 fabricates a zero-mortality history for each
# country's first 1/7/30 rows; dropping those rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [62]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[62]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [63]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [64]:
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these six columns are principal components, not the original
# features; labelling them with the feature names makes the later
# feature-importance table read as if it ranked the raw inputs — confirm that
# interpretation is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [65]:
# Convert categorical variables to numerical variables using one-hot encoding.
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this step only widens df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [66]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for the XGBoost Model.
# NOTE(review): train_test_split shuffles by default; with lagged-target
# features on a daily time series this mixes past and future observations
# between train and test — consider shuffle=False or a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [67]:
# Fit scaling on the training set only (statistics are reused for the test set).
# NOTE(review): scaling is applied to the principal components; the more usual
# order is StandardScaler before PCA — confirm this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[67]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [68]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [69]:
# Apply scaling on the test set, using training-set statistics (no test leakage here)
X_test_scaled = scaler.transform(X_test)
In [70]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [71]:
# Perform grid search with 10-fold cross-validation (k = 10).
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the corresponding mean CV score (R^2 for regressors)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985113247826553
In [72]:
# Use the model refit with the best hyperparameters.
# GridSearchCV(refit=True, the default) has already refit best_estimator_ on
# the full training set, so the previous explicit best_model.fit(...) call was
# redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [73]:
# Evaluate the performance of the XGBoost Model: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors and returns
# the KL divergence between them, i.e. it treats mortality rates as probability
# distributions — it is not a standard regression metric; confirm this is the
# intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002853509898203111
R2 Score: 0.9985715941766472
RMSE: 0.053418
Entropy Value: 0.00029581235478226195
In [74]:
# Rank the model inputs by the importance XGBoost assigned to them.
# (The inputs are principal components labelled with the original feature names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[74]:
feature importance
1 diabetes_prevalence 0.861019
0 cardiovasc_death_rate 0.088271
3 male_smokers 0.020801
2 female_smokers 0.020747
5 aged_65_older 0.008471
4 life_expectancy 0.000691
In [75]:
# Importing the dataframe of all 26 countries (resets df_updated for the next analysis).
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[75]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [76]:
country1 = 'Portugal'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [77]:
df_updated
Out[77]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 102.619 2119843 0.536669

2096 rows × 9 columns

In [78]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day, previous week, and previous month mortality rates.
# groupby('location') keeps each country's lags from bleeding into the other country.
# NOTE(review): shift() assumes rows are date-sorted within each country — TODO
# confirm the source CSV is sorted by date per location before relying on these lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [79]:
# Replace the NaN values that shift() leaves at the start of each country's series with 0.
# NOTE(review): filling with 0 fabricates a zero-mortality history for each
# country's first 1/7/30 rows; dropping those rows instead may be cleaner.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [80]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[80]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [81]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [82]:
# Obtaining the resulting dataframe after performing Principal Component Analysis.
# NOTE(review): these six columns are principal components, not the original
# features; labelling them with the feature names makes the later
# feature-importance table read as if it ranked the raw inputs — confirm that
# interpretation is intended.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [83]:
# Convert categorical variables to numerical variables using one-hot encoding.
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df), so this step only widens df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [84]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for the XGBoost Model.
# NOTE(review): train_test_split shuffles by default; with lagged-target
# features on a daily time series this mixes past and future observations
# between train and test — consider shuffle=False or a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [85]:
# Fit scaling on the training set only (statistics are reused for the test set).
# NOTE(review): scaling is applied to the principal components; the more usual
# order is StandardScaler before PCA — confirm this ordering is deliberate.
scaler = StandardScaler()
scaler.fit(X_train)
Out[85]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [86]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [87]:
# Apply scaling on the test set, using training-set statistics (no test leakage here)
X_test_scaled = scaler.transform(X_test)
In [88]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [89]:
# Perform grid search with 10-fold cross-validation (k = 10).
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 uses all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the corresponding mean CV score (R^2 for regressors)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9987931112255966
In [90]:
# Use the model refit with the best hyperparameters.
# GridSearchCV(refit=True, the default) has already refit best_estimator_ on
# the full training set, so the previous explicit best_model.fit(...) call was
# redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [91]:
# Evaluate the performance of the XGBoost Model: Mean Squared Error (MSE),
# Root Mean Squared Error (RMSE), R^2 Score, and Entropy.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both vectors and returns
# the KL divergence between them, i.e. it treats mortality rates as probability
# distributions — it is not a standard regression metric; confirm this is the
# intended quantity.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002719376915070921
R2 Score: 0.9986387382697272
RMSE: 0.052148
Entropy Value: 0.00030312420047472904
In [92]:
# Rank the model's input features by learned importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[92]:
feature importance
1 human_development_index 0.727041
5 population 0.146595
0 hospital_beds_per_thousand 0.068287
2 extreme_poverty 0.033873
3 gdp_per_capita 0.023341
4 population_density 0.000863
In [93]:
# Importing the dataframe of all 26 countries
# TODO(review): hardcoded absolute Windows path — replace with a configurable
# DATA_DIR / relative path so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[93]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [94]:
country1 = 'Spain'
country2 = 'Austria'

# Restrict to the population-health features used by this XGBoost model,
# keeping only the two countries under comparison.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                'Mortality Rate']
in_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_countries, feature_cols]
In [95]:
# Inspect the filtered two-country dataframe
df_updated
Out[95]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 0.855148

2102 rows × 9 columns

In [96]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1-day, 7-day, 30-day lags)
# so the time series can be modeled as a supervised-learning problem.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [97]:
# Replace NaN values in the lag columns with 0: the first rows of each
# country have no prior observation, so "no history" is encoded as zero.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [98]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and the lagged-mortality columns, so the target leaks into the components
# later used as model inputs — this likely inflates the reported R^2 scores.
# PCA is also fit on unscaled data and on the full dataset before the
# train/test split. Confirm all of this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[98]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [99]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of input variables for the XGBoost Model Analysis
# Keep only the first n_components columns (highest explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [100]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original feature names makes the later feature-
# importance table misleading; consider 'PC1'..'PC6' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [101]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df) — confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [102]:
# Input features: the six principal components (labeled with original names).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# (random_state fixed for reproducibility; note the shuffle discards the
# chronological ordering of the series)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [103]:
# Fit scaling on the training set
# (fit on training data only, so test-set statistics cannot leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[103]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [104]:
# Apply scaling on the training set
# (uses the mean/std learned from X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [105]:
# Apply scaling on the test set
# (same fitted scaler as the training set — never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [106]:
# Define XGBoost model (defaults; hyperparameters are tuned via grid search below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune:
#   max_depth        - maximum tree depth (model complexity)
#   learning_rate    - shrinkage applied per boosting round
#   n_estimators     - number of boosting rounds
#   gamma            - minimum loss reduction required to make a split
#   subsample        - fraction of rows sampled per tree
#   colsample_bytree - fraction of columns sampled per tree
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [107]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows were shuffled by train_test_split, so plain k-fold CV
# mixes past and future observations of the same series within folds —
# confirm this is acceptable for a time-series-derived dataset.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.999159213416352
In [108]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this extra fit() call is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [109]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalizes them) and returns their KL divergence — it is
# not a standard regression metric, and zeros in y_test / y_pred can yield
# inf or nan. Confirm this metric is intended here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006012348134524364
R2 Score: 0.9989310590870992
RMSE: 0.077539
Entropy Value: 0.00034449028141466515
In [110]:
# Rank the model's input features by learned importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[110]:
feature importance
1 diabetes_prevalence 0.471275
0 cardiovasc_death_rate 0.425063
5 aged_65_older 0.059626
2 female_smokers 0.033989
3 male_smokers 0.009843
4 life_expectancy 0.000204
In [111]:
# Importing the dataframe of all 26 countries
# TODO(review): hardcoded absolute Windows path — replace with a configurable
# DATA_DIR / relative path so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[111]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [112]:
country1 = 'Spain'
country2 = 'Austria'

# Restrict to the country-health-index features used by this XGBoost model,
# keeping only the two countries under comparison.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
in_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_countries, feature_cols]
In [113]:
# Inspect the filtered two-country dataframe
df_updated
Out[113]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.97 0.904 1.0 34272.360 93.105 47558632 0.855148

2102 rows × 9 columns

In [114]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1-day, 7-day, 30-day lags)
# so the time series can be modeled as a supervised-learning problem.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [115]:
# Replace NaN values in the lag columns with 0: the first rows of each
# country have no prior observation, so "no history" is encoded as zero.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [116]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and the lagged-mortality columns, so the target leaks into the components
# later used as model inputs — this likely inflates the reported R^2 scores.
# PCA is also fit on unscaled data and on the full dataset before the
# train/test split. Confirm all of this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[116]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [117]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of input variables for the XGBoost Model Analysis
# Keep only the first n_components columns (highest explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [118]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original feature names makes the later feature-
# importance table misleading; consider 'PC1'..'PC6' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [119]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df) — confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [120]:
# Input features: the six principal components (labeled with original names).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# (random_state fixed for reproducibility; note the shuffle discards the
# chronological ordering of the series)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [121]:
# Fit scaling on the training set
# (fit on training data only, so test-set statistics cannot leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[121]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [122]:
# Apply scaling on the training set
# (uses the mean/std learned from X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [123]:
# Apply scaling on the test set
# (same fitted scaler as the training set — never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [124]:
# Define XGBoost model (defaults; hyperparameters are tuned via grid search below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune:
#   max_depth        - maximum tree depth (model complexity)
#   learning_rate    - shrinkage applied per boosting round
#   n_estimators     - number of boosting rounds
#   gamma            - minimum loss reduction required to make a split
#   subsample        - fraction of rows sampled per tree
#   colsample_bytree - fraction of columns sampled per tree
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [125]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows were shuffled by train_test_split, so plain k-fold CV
# mixes past and future observations of the same series within folds —
# confirm this is acceptable for a time-series-derived dataset.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.998690899271246
In [126]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this extra fit() call is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [127]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalizes them) and returns their KL divergence — it is
# not a standard regression metric, and zeros in y_test / y_pred can yield
# inf or nan. Confirm this metric is intended here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006240496359956682
R2 Score: 0.9988904964039489
RMSE: 0.078997
Entropy Value: 0.0005445524704743854
In [128]:
# Rank the model's input features by learned importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[128]:
feature importance
1 human_development_index 0.524350
5 population 0.363217
0 hospital_beds_per_thousand 0.075626
2 extreme_poverty 0.027055
3 gdp_per_capita 0.009581
4 population_density 0.000171
In [129]:
# Importing the dataframe of all 26 countries
# TODO(review): hardcoded absolute Windows path — replace with a configurable
# DATA_DIR / relative path so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[129]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [130]:
country1 = 'Belgium'
country2 = 'Canada'

# Restrict to the population-health features used by this XGBoost model,
# keeping only the two countries under comparison.
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                'Mortality Rate']
in_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_countries, feature_cols]
In [131]:
# Inspect the filtered two-country dataframe
df_updated
Out[131]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 18.571 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 1.093162

2132 rows × 9 columns

In [132]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1-day, 7-day, 30-day lags)
# so the time series can be modeled as a supervised-learning problem.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [133]:
# Replace NaN values in the lag columns with 0: the first rows of each
# country have no prior observation, so "no history" is encoded as zero.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [134]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and the lagged-mortality columns, so the target leaks into the components
# later used as model inputs — this likely inflates the reported R^2 scores.
# PCA is also fit on unscaled data and on the full dataset before the
# train/test split. Confirm all of this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[134]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [135]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of input variables for the XGBoost Model Analysis
# Keep only the first n_components columns (highest explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [136]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — reusing the original feature names makes the later feature-
# importance table misleading; consider 'PC1'..'PC6' instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [137]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df) — confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [138]:
# Input features: the six principal components (labeled with original names).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# (random_state fixed for reproducibility; note the shuffle discards the
# chronological ordering of the series)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [139]:
# Fit scaling on the training set
# (fit on training data only, so test-set statistics cannot leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[139]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [140]:
# Apply scaling on the training set
# (uses the mean/std learned from X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [141]:
# Apply scaling on the test set
# (same fitted scaler as the training set — never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [142]:
# Define XGBoost model (defaults; hyperparameters are tuned via grid search below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune:
#   max_depth        - maximum tree depth (model complexity)
#   learning_rate    - shrinkage applied per boosting round
#   n_estimators     - number of boosting rounds
#   gamma            - minimum loss reduction required to make a split
#   subsample        - fraction of rows sampled per tree
#   colsample_bytree - fraction of columns sampled per tree
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [143]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): rows were shuffled by train_test_split, so plain k-fold CV
# mixes past and future observations of the same series within folds —
# confirm this is acceptable for a time-series-derived dataset.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990745539142948
In [144]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the whole training set
# by default (refit=True), so this extra fit() call is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [145]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy treats its two arguments as probability
# distributions (it normalizes them) and returns their KL divergence — it is
# not a standard regression metric, and zeros in y_test / y_pred can yield
# inf or nan. Confirm this metric is intended here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.014310712890414518
R2 Score: 0.9989178637912717
RMSE: 0.119627
Entropy Value: 0.00033461301188507007
In [146]:
# Rank the model's input features by learned importance, highest first.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[146]:
feature importance
1 diabetes_prevalence 0.906030
0 cardiovasc_death_rate 0.054128
5 aged_65_older 0.022543
2 female_smokers 0.014168
3 male_smokers 0.003060
4 life_expectancy 0.000070
In [147]:
# Importing the dataframe of all 26 countries
# TODO(review): hardcoded absolute Windows path — replace with a configurable
# DATA_DIR / relative path so the notebook is portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[147]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [148]:
country1 = 'Belgium'
country2 = 'Canada'

# Restrict to the country-health-index features used by this XGBoost model,
# keeping only the two countries under comparison.
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'extreme_poverty', 'gdp_per_capita', 'population_density', 'population',
                'Mortality Rate']
in_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_countries, feature_cols]
In [149]:
# Inspect the filtered two-country dataframe
df_updated
Out[149]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.50 0.929 0.5 44017.591 4.037 38454328 1.093162

2132 rows × 9 columns

In [150]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add per-country lagged mortality features (1-day, 7-day, 30-day lags)
# so the time series can be modeled as a supervised-learning problem.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [151]:
# Replace NaN values in the lag columns with 0: the first rows of each
# country have no prior observation, so "no history" is encoded as zero.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [152]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and the lagged-mortality columns, so the target leaks into the components
# later used as model inputs — this likely inflates the reported R^2 scores.
# PCA is also fit on unscaled data and on the full dataset before the
# train/test split. Confirm all of this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[152]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [153]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of input variables for the XGBoost Model Analysis
# Keep only the first n_components columns (highest explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [154]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mix of *all* PCA input columns), not the original feature
# it is named after; downstream "feature importance" therefore describes
# components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [155]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X is built from
# principal_df and only 'Mortality Rate' is read from df_updated — so this
# encoding step appears to be dead weight.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [156]:
# Model inputs: the first six principal components (named after, but not equal
# to, these original features — see the PCA cell above).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split shuffles a daily time series whose PCA inputs
# include lagged target values, so adjacent days land in both sets — temporal
# leakage that likely inflates the test scores; a time-based split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [157]:
# Fit scaling on the training set
# Fitting on X_train only (never X_test) keeps test data out of the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[157]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [158]:
# Apply scaling on the training set
# Standardize using the per-column mean/std learned from the training data.
X_train_scaled = scaler.transform(X_train)
In [159]:
# Apply scaling on the test set
# Reuse the training-set statistics so test data is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [160]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations for the grid search below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [161]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 uses all cores.
# refit=True (the default) retrains the best model on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982635754420282
In [162]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): this refit is redundant — GridSearchCV with refit=True (the
# default) already refits best_estimator_ on the same training data.
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [163]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats both arrays as probability
# distributions (normalizes them, then computes KL divergence). It is not a
# standard regression metric and is undefined for negative values or all-zero
# arrays — confirm this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.021593332086563184
R2 Score: 0.9983671724325057
RMSE: 0.146947
Entropy Value: 0.0011894401573703628
In [164]:
# Rank the model's six inputs by XGBoost importance score, largest first.
# (Remember: these are principal components, labelled with the original names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[164]:
feature importance
1 human_development_index 0.854404
0 hospital_beds_per_thousand 0.055223
2 extreme_poverty 0.054972
5 population 0.017190
3 gdp_per_capita 0.014188
4 population_density 0.004023
In [165]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# a configurable data directory would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[165]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [166]:
country1 = 'Czechia'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged-column assignments in later cells cannot trigger
# SettingWithCopyWarning or silently write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [167]:
df_updated
Out[167]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 19.027 0.000000
... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.227772
6245 Denmark 12/26/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.227772
6246 Denmark 12/27/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.228905
6247 Denmark 12/28/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.229131
6248 Denmark 12/29/2022 114.767 6.41 19.3 18.8 80.90 19.677 0.229131

2096 rows × 9 columns

In [168]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country.
# Grouping by location keeps each country's series separate.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [169]:
# The first day/week/month of each country's series has no history, so the
# shifted columns begin with NaN; replace those leading NaNs with 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [170]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lag columns — target
# leakage that would inflate the near-perfect R^2 below; verify the slice.
# NOTE(review): PCA is fit on unscaled data; high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[170]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [171]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep only the first 6 projected columns; PCA orders them by explained variance.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [172]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading names — each column is a principal component (a
# linear mix of all PCA inputs), not the original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [173]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from
# principal_df and only 'Mortality Rate' is read from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [174]:
# Model inputs: the first six principal components (named after, but not equal
# to, these original features — see the PCA cell above).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series whose PCA inputs include
# lagged target values — temporal leakage that likely inflates test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [175]:
# Fit scaling on the training set
# Fitting on X_train only keeps test data out of the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[175]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [176]:
# Apply scaling on the training set
# Standardize using the per-column mean/std learned from the training data.
X_train_scaled = scaler.transform(X_train)
In [177]:
# Apply scaling on the test set
# Reuse the training-set statistics so test data is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [178]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations for the grid search below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [179]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 uses all cores.
# refit=True (the default) retrains the best model on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991757801493086
In [180]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): redundant refit — GridSearchCV(refit=True, default) already
# refit best_estimator_ on the same training data.
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [181]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and returns KL divergence — not a standard regression metric,
# and undefined for negative values or all-zero arrays; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0026568878338268037
R2 Score: 0.9977941834696354
RMSE: 0.051545
Entropy Value: 0.000440953658890907
In [182]:
# Rank the model's six inputs by XGBoost importance score, largest first.
# (Remember: these are principal components, labelled with the original names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[182]:
feature importance
1 diabetes_prevalence 0.861674
5 aged_65_older 0.091122
0 cardiovasc_death_rate 0.032199
2 female_smokers 0.014190
3 male_smokers 0.000679
4 life_expectancy 0.000135
In [183]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[183]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [184]:
country1 = 'Czechia'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged-column assignments in later cells cannot trigger
# SettingWithCopyWarning or silently write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [185]:
df_updated
Out[185]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.90 0.0 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.227772
6245 Denmark 12/26/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.227772
6246 Denmark 12/27/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.228905
6247 Denmark 12/28/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.229131
6248 Denmark 12/29/2022 2.50 0.94 0.2 46682.515 136.520 5882259 0.229131

2096 rows × 9 columns

In [186]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country.
# Grouping by location keeps each country's series separate.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [187]:
# The first day/week/month of each country's series has no history, so the
# shifted columns begin with NaN; replace those leading NaNs with 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [188]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lag columns — target
# leakage that would inflate the near-perfect R^2 below; verify the slice.
# NOTE(review): PCA is fit on unscaled data; high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[188]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [189]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep only the first 6 projected columns; PCA orders them by explained variance.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [190]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading names — each column is a principal component (a
# linear mix of all PCA inputs), not the original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [191]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from
# principal_df and only 'Mortality Rate' is read from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [192]:
# Model inputs: the first six principal components (named after, but not equal
# to, these original features — see the PCA cell above).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series whose PCA inputs include
# lagged target values — temporal leakage that likely inflates test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [193]:
# Fit scaling on the training set
# Fitting on X_train only keeps test data out of the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[193]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [194]:
# Apply scaling on the training set
# Standardize using the per-column mean/std learned from the training data.
X_train_scaled = scaler.transform(X_train)
In [195]:
# Apply scaling on the test set
# Reuse the training-set statistics so test data is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [196]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations for the grid search below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [197]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 uses all cores.
# refit=True (the default) retrains the best model on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990802017690633
In [198]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): redundant refit — GridSearchCV(refit=True, default) already
# refit best_estimator_ on the same training data.
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [199]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and returns KL divergence — not a standard regression metric,
# and undefined for negative values or all-zero arrays; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0018119703339651765
R2 Score: 0.9984956556824479
RMSE: 0.042567
Entropy Value: 0.0003609256883659556
In [200]:
# Rank the model's six inputs by XGBoost importance score, largest first.
# (Remember: these are principal components, labelled with the original names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[200]:
feature importance
1 human_development_index 0.672643
0 hospital_beds_per_thousand 0.166905
5 population 0.134752
2 extreme_poverty 0.022526
3 gdp_per_capita 0.002956
4 population_density 0.000219
In [201]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[201]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [202]:
country1 = 'Estonia'
country2 = 'Finland'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged-column assignments in later cells cannot trigger
# SettingWithCopyWarning or silently write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [203]:
df_updated
Out[203]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159
8372 Finland 12/26/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159
8373 Finland 12/27/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159
8374 Finland 12/28/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159
8375 Finland 12/29/2022 153.507 5.76 18.3 22.6 81.91 21.228 0.55159

2127 rows × 9 columns

In [204]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country.
# Grouping by location keeps each country's series separate.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [205]:
# The first day/week/month of each country's series has no history, so the
# shifted columns begin with NaN; replace those leading NaNs with 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [206]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] drops only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' (the target) and its lag columns — target
# leakage that would inflate the near-perfect R^2 below; verify the slice.
# NOTE(review): PCA is fit on unscaled data; high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[206]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [207]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Keep only the first 6 projected columns; PCA orders them by explained variance.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [208]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading names — each column is a principal component (a
# linear mix of all PCA inputs), not the original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [209]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below — X comes from
# principal_df and only 'Mortality Rate' is read from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [210]:
# Model inputs: the first six principal components (named after, but not equal
# to, these original features — see the PCA cell above).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of a daily time series whose PCA inputs include
# lagged target values — temporal leakage that likely inflates test scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [211]:
# Fit scaling on the training set
# Fitting on X_train only keeps test data out of the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[211]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [212]:
# Apply scaling on the training set
# Standardize using the per-column mean/std learned from the training data.
X_train_scaled = scaler.transform(X_train)
In [213]:
# Apply scaling on the test set
# Reuse the training-set statistics so test data is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [214]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations for the grid search below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [215]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3,240 fits; n_jobs=-1 uses all cores.
# refit=True (the default) retrains the best model on the full training set.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9971001469654569
In [216]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): redundant refit — GridSearchCV(refit=True, default) already
# refit best_estimator_ on the same training data.
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [217]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and returns KL divergence — not a standard regression metric,
# and undefined for negative values or all-zero arrays; confirm intent.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00293040781405009
R2 Score: 0.9973865357349957
RMSE: 0.054133
Entropy Value: 0.0007814952239726046
In [218]:
# Rank the model's six inputs by XGBoost importance score, largest first.
# (Remember: these are principal components, labelled with the original names.)
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[218]:
feature importance
1 diabetes_prevalence 0.877787
5 aged_65_older 0.056560
0 cardiovasc_death_rate 0.031452
2 female_smokers 0.018166
3 male_smokers 0.011544
4 life_expectancy 0.004491
In [219]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[219]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [220]:
country1 = 'Estonia'
country2 = 'Finland'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the
# lagged-column assignments in later cells cannot trigger
# SettingWithCopyWarning or silently write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [221]:
df_updated
Out[221]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
6250 Estonia 1/18/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
6251 Estonia 2/5/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
6252 Estonia 2/6/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
6253 Estonia 2/7/2020 4.69 0.892 0.50 29481.252 31.033 1326064 0.00000
... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 18.136 5540745 0.55159

2127 rows × 9 columns

In [222]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows within each country are already sorted by
# date — the displayed frame looks chronological, but an explicit
# sort_values(['location', 'date']) beforehand would make this robust.
# NOTE(review): df_updated was produced by boolean filtering above, so these
# chained assignments may raise SettingWithCopyWarning; taking a .copy()
# after filtering would silence it safely.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [223]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# The first 1/7/30 rows of each country's series have no lagged observation;
# zero is a reasonable fill since mortality starts at zero early on.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [224]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the prediction target)
# and the lagged-mortality columns, so the principal components are partly
# built from the target itself — target leakage that inflates the downstream
# R^2. PCA should be fit only on the predictor columns.
# NOTE(review): PCA is fit on unscaled data here, while StandardScaler is
# only applied after the PCA transform; conventionally features are
# standardized before PCA so large-scale columns (e.g. population) don't
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[224]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [225]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [226]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each principal component
# is a linear combination of ALL input columns, not the single original
# feature it is named after. Naming them PC1..PC6 would prevent misreading
# the feature-importance table later.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [227]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream —
# X below is taken from principal_df — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [228]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first six principal components (labelled with original feature
# names — they are PCs, not the raw features); y is the raw mortality rate.
# Row alignment holds because principal_df was built from df_updated in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffle split on a time series lets the model train
# on future rows; a chronological (or per-country) split would give an honest
# estimate of forecasting performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [229]:
# Fit scaling on the training set
# Correctly fit on train only, so test-set statistics never leak into scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[229]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [230]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [231]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [232]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 below this trains
# 3,240 models, so n_jobs=-1 parallelism matters.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [233]:
# Perform grid search and 10-fold cross-validation (k = 10)
# GridSearchCV's default scoring for a regressor is the estimator's R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9973522032432347
In [234]:
# Fit the model using the best hyperparameters
# NOTE(review): best_estimator_ is already refit on the full training set by
# GridSearchCV (refit=True by default), so this explicit fit merely repeats
# that work — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [235]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes KL divergence
# between the two vectors after normalising each to sum to 1 — it is not a
# regression error metric and requires non-negative values. Consider dropping
# it or documenting what it is meant to capture.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002073800874034314
R2 Score: 0.9981504948044987
RMSE: 0.045539
Entropy Value: 0.0008383799779777811
In [236]:
# Rank inputs by XGBoost's learned importance.
# NOTE(review): X contains principal components, so these importances belong
# to the PCs, not to the original features whose names they carry — e.g.
# "human_development_index" here is really PC2. Interpret with care.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[236]:
feature importance
1 human_development_index 0.825465
5 population 0.069406
0 hospital_beds_per_thousand 0.057109
2 extreme_poverty 0.029447
3 gdp_per_capita 0.014664
4 population_density 0.003909
In [237]:
# Importing the dataframe of all 26 countries
# Re-load the full dataset from disk (df_updated was filtered/transformed in
# the previous pipeline). NOTE(review): hardcoded absolute path — prefer a
# configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[237]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [238]:
country1 = 'France'
country2 = 'Latvia'

# Keep only the population-health-index features (plus identifiers and the
# target column) and restrict the frame to the two countries being compared.
population_index_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers',
    'life_expectancy', 'aged_65_older',
    'Mortality Rate',
]
in_scope = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_scope, population_index_cols]
In [239]:
df_updated
Out[239]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
8376 France 1/24/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
8377 France 1/25/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
8378 France 1/26/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
8379 France 1/27/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
8380 France 1/28/2020 86.06 4.77 30.1 35.6 82.66 19.718 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631631
20907 Latvia 12/26/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631631
20908 Latvia 12/27/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631485
20909 Latvia 12/28/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631485
20910 Latvia 12/29/2022 350.06 4.91 25.6 51.0 75.29 19.754 0.631969

2109 rows × 9 columns

In [240]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are date-sorted within each country; an
# explicit sort_values(['location', 'date']) beforehand would make this robust.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [241]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# Zero-fill the leading rows of each country's series that have no lag value.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [242]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lag columns, so the components are partly built from the target — leakage
# that inflates the downstream R^2. Fit PCA on predictor columns only.
# NOTE(review): PCA is fit on unscaled data; standardizing before PCA is the
# conventional order.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[242]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [243]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [244]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading names — each column is a principal component (a
# mix of all inputs), not the original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [245]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused downstream (X comes from
# principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [246]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X = first six principal components (PC labels, not raw features); y = raw
# mortality rate. Rows align because principal_df preserved df_updated order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffle split on a time series trains on future rows;
# a chronological split would be more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [247]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[247]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [248]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [249]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [250]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 324 candidate combinations; with cv=10 this trains 3,240 models.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [251]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default regressor scoring is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9943677613139872
In [252]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV already refits best_estimator_ on
# the training set (refit=True by default).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [253]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): entropy(y_test, y_pred) is KL divergence of the normalised
# vectors, not a regression error metric; it also requires non-negative data.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.04041345514758587
R2 Score: 0.9964586270192674
RMSE: 0.201031
Entropy Value: 0.0012288161708932492
In [254]:
# Rank inputs by XGBoost's learned importance.
# NOTE(review): these are importances of the principal components, not of the
# original features whose names they carry (e.g. "diabetes_prevalence" = PC2).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[254]:
feature importance
1 diabetes_prevalence 0.730534
0 cardiovasc_death_rate 0.181573
5 aged_65_older 0.041538
2 female_smokers 0.030633
3 male_smokers 0.011895
4 life_expectancy 0.003826
In [255]:
# Importing the dataframe of all 26 countries
# Re-load the full dataset (df_updated was filtered/transformed above).
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[255]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [256]:
country1 = 'France'
country2 = 'Latvia'

# Keep only the country-health-index features (plus identifiers and the
# target column) and restrict the frame to the two countries being compared.
country_index_cols = [
    'location', 'date',
    'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita',
    'population_density', 'population',
    'Mortality Rate',
]
in_scope = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_scope, country_index_cols]
In [257]:
df_updated
Out[257]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 122.578 67813000 0.000000
... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.70 25063.846 31.212 1850654 0.631969

2109 rows × 9 columns

In [258]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are date-sorted within each country; an
# explicit sort_values(['location', 'date']) beforehand would make this robust.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [259]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# Zero-fill the leading rows of each country's series that have no lag value.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [260]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lag columns — target leakage that inflates the downstream R^2. Fit PCA on
# predictor columns only. PCA is also fit before (not after) standardization.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[260]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [261]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [262]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading names — each column is a principal component (a
# mix of all inputs), not the original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [263]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused downstream (X comes from
# principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [264]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X = first six principal components (PC labels, not raw features); y = raw
# mortality rate. Rows align because principal_df preserved df_updated order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffle split on a time series trains on future rows;
# a chronological split would be more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [265]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[265]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [266]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [267]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [268]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 324 candidate combinations; with cv=10 this trains 3,240 models.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [269]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default regressor scoring is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.993812574670508
In [270]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV already refits best_estimator_ on
# the training set (refit=True by default).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [271]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): entropy(y_test, y_pred) is KL divergence of the normalised
# vectors, not a regression error metric; it also requires non-negative data.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.04872995253687114
R2 Score: 0.9957298643079082
RMSE: 0.220749
Entropy Value: 0.0019759927941236785
In [272]:
# Rank inputs by XGBoost's learned importance.
# NOTE(review): these are importances of the principal components, not of the
# original features whose names they carry.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[272]:
feature importance
1 human_development_index 0.867631
0 hospital_beds_per_thousand 0.063217
3 gdp_per_capita 0.022920
2 extreme_poverty 0.021983
5 population 0.018837
4 population_density 0.005412
In [273]:
# Importing the dataframe of all 26 countries
# Re-load the full dataset (df_updated was filtered/transformed above).
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[273]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [274]:
country1 = 'Netherlands'
country2 = 'Romania'

# Keep only the population-health-index features (plus identifiers and the
# target column) and restrict the frame to the two countries being compared.
population_index_cols = [
    'location', 'date',
    'cardiovasc_death_rate', 'diabetes_prevalence',
    'female_smokers', 'male_smokers',
    'life_expectancy', 'aged_65_older',
    'Mortality Rate',
]
in_scope = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_scope, population_index_cols]
In [275]:
df_updated
Out[275]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.850 2.036403

2075 rows × 9 columns

In [276]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are date-sorted within each country; an
# explicit sort_values(['location', 'date']) beforehand would make this robust.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [277]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# Zero-fill the leading rows of each country's series that have no lag value.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [278]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its
# lag columns — target leakage that inflates the downstream R^2. Fit PCA on
# predictor columns only. PCA is also fit before (not after) standardization.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[278]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [279]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [280]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): misleading names — each column is a principal component (a
# mix of all inputs), not the original feature it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [281]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are unused downstream (X comes from
# principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [282]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X = first six principal components (PC labels, not raw features); y = raw
# mortality rate. Rows align because principal_df preserved df_updated order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffle split on a time series trains on future rows;
# a chronological split would be more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [283]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[283]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [284]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [285]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [286]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 324 candidate combinations; with cv=10 this trains 3,240 models.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [287]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default regressor scoring is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9993100745193948
In [288]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set — the extra .fit() call previously here was
# redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [289]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the KL divergence after normalising both
# arguments into probability distributions; any zero in y_pred where
# y_test > 0 makes the divergence infinite (later runs of this notebook
# printed "Entropy Value: inf"). Clip both arrays to a small positive floor
# so the reported value stays finite.
_eps = 1e-12
entropy_val = entropy(np.clip(y_test, _eps, None), np.clip(y_pred, _eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0036117981905216217
R2 Score: 0.99953160146208
RMSE: 0.060098
Entropy Value: 0.00019551055515040766
In [290]:
# Rank the model inputs by the importance the tuned model assigned them
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[290]:
feature importance
1 diabetes_prevalence 0.746227
5 aged_65_older 0.150810
0 cardiovasc_death_rate 0.090035
2 female_smokers 0.012561
3 male_smokers 0.000259
4 life_expectancy 0.000107
In [291]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[291]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [292]:
country1 = 'Netherlands'
country2 = 'Romania'

# Extracting important features for XGBoost Model Analysis for the country health index,
# restricted to the two countries under comparison
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [293]:
# Inspect the filtered two-country frame (rich display of the last expression)
df_updated
Out[293]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.320 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.7 23313.199 85.129 19659270 2.036403

2075 rows × 9 columns

In [294]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# df_updated is a row-filtered slice of the imported frame; take an explicit
# copy so the column assignments below cannot raise SettingWithCopyWarning or
# silently write to a view of the original frame.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [295]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (lags are undefined for each country's first day / week / month)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [296]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the three
# lagged-mortality columns, so the fitted components are partly derived from
# the prediction target (target leakage into the later train/test split).
# PCA is also fit on unscaled features here — confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[296]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [297]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all components, then keep only the first (highest-variance) 6
# — equivalent to fitting PCA(n_components=6) directly.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [298]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — reusing the raw feature names here (and hence in the later
# feature-importance table) is misleading; consider PC1..PC6 labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [299]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never fed to the
# model downstream (X is built from principal_df); only 'Mortality Rate' is
# read from df_updated after this point — confirm the encoding is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [300]:
# Model inputs come from the PCA-transformed frame; the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [301]:
# Learn standardisation statistics (mean/std) from the training split only
scaler = StandardScaler().fit(X_train)
scaler
Out[301]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [302]:
# Apply scaling on the training set
# (uses the mean/std learned from the training split above)
X_train_scaled = scaler.transform(X_train)
In [303]:
# Apply scaling on the test set
# (re-uses the training-set statistics — the scaler is not refit on test data)
X_test_scaled = scaler.transform(X_test)
In [304]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over: tree depth and count control capacity,
# learning_rate/gamma control the fit dynamics, subsample/colsample_bytree
# control per-tree row/column subsampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [305]:
# Exhaustive grid search with 10-fold cross-validation (k = 10),
# parallelised across all available cores
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992177391087653
In [306]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set — the extra .fit() call previously here was
# redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [307]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the KL divergence after normalising both
# arguments into probability distributions; any zero in y_pred where
# y_test > 0 makes the divergence infinite (later runs of this notebook
# printed "Entropy Value: inf"). Clip both arrays to a small positive floor
# so the reported value stays finite.
_eps = 1e-12
entropy_val = entropy(np.clip(y_test, _eps, None), np.clip(y_pred, _eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0043282460748784585
R2 Score: 0.9994386884243556
RMSE: 0.065789
Entropy Value: 0.00015036708477313637
In [308]:
# Rank the model inputs by the importance the tuned model assigned them
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[308]:
feature importance
0 hospital_beds_per_thousand 0.457357
1 human_development_index 0.431411
5 population 0.098481
2 extreme_poverty 0.011752
3 gdp_per_capita 0.000915
4 population_density 0.000084
In [320]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[320]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [321]:
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting important features for XGBoost Model Analysis for the population health index,
# restricted to the two countries under comparison
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [322]:
# Inspect the filtered two-country frame (rich display of the last expression)
df_updated
Out[322]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 76.00 17.366 0.716205

2067 rows × 9 columns

In [323]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# df_updated is a row-filtered slice of the imported frame; take an explicit
# copy so the column assignments below cannot raise SettingWithCopyWarning or
# silently write to a view of the original frame.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [324]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (lags are undefined for each country's first day / week / month)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [325]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the three
# lagged-mortality columns, so the fitted components are partly derived from
# the prediction target (target leakage into the later train/test split).
# PCA is also fit on unscaled features here — confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[325]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [326]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all components, then keep only the first (highest-variance) 6
# — equivalent to fitting PCA(n_components=6) directly.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [327]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — reusing the raw feature names here (and hence in the later
# feature-importance table) is misleading; consider PC1..PC6 labels.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [328]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never fed to the
# model downstream (X is built from principal_df); only 'Mortality Rate' is
# read from df_updated after this point — confirm the encoding is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [329]:
# Model inputs come from the PCA-transformed frame; the target is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [330]:
# Learn standardisation statistics (mean/std) from the training split only
scaler = StandardScaler().fit(X_train)
scaler
Out[330]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [331]:
# Apply scaling on the training set
# (uses the mean/std learned from the training split above)
X_train_scaled = scaler.transform(X_train)
In [332]:
# Apply scaling on the test set
# (re-uses the training-set statistics — the scaler is not refit on test data)
X_test_scaled = scaler.transform(X_test)
In [333]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over: tree depth and count control capacity,
# learning_rate/gamma control the fit dynamics, subsample/colsample_bytree
# control per-tree row/column subsampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [334]:
# Exhaustive grid search with 10-fold cross-validation (k = 10),
# parallelised across all available cores
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9960436315500438
In [335]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set — the extra .fit() call previously here was
# redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [336]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the KL divergence after normalising both
# arguments into probability distributions; any zero in y_pred where
# y_test > 0 makes the divergence infinite (this cell previously printed
# "Entropy Value: inf"). Clip both arrays to a small positive floor so the
# reported value stays finite.
_eps = 1e-12
entropy_val = entropy(np.clip(y_test, _eps, None), np.clip(y_pred, _eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0007537855022416325
R2 Score: 0.9969151482074694
RMSE: 0.027455
Entropy Value: inf
In [337]:
# Rank the model inputs by the importance the tuned model assigned them
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[337]:
feature importance
1 diabetes_prevalence 0.603285
0 cardiovasc_death_rate 0.222175
5 aged_65_older 0.118124
2 female_smokers 0.025456
4 life_expectancy 0.023034
3 male_smokers 0.007925
In [338]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[338]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [339]:
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting important features for XGBoost Model Analysis for the country health index,
# restricted to the two countries under comparison
cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [340]:
# Inspect the filtered two-country frame (rich display of the last expression)
df_updated
Out[340]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.820 0.860 0.70 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 14048.881 80.291 6871547 0.716205

2067 rows × 9 columns

In [341]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# df_updated is a row-filtered slice of the imported frame; take an explicit
# copy so the column assignments below cannot raise SettingWithCopyWarning or
# silently write to a view of the original frame.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [342]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (lags are undefined for each country's first day / week / month)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [343]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' and the three
# lagged-mortality columns, so the fitted components are partly derived from
# the prediction target (target leakage into the later train/test split).
# PCA is also fit on unscaled features here — confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[343]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [344]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all components, then keep only the first (highest-variance) 6
# — equivalent to fitting PCA(n_components=6) directly.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [345]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — reusing the raw feature names here (and hence in the later
# feature-importance table) is misleading; consider PC1..PC6 labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [346]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never fed to the
# model downstream (X is built from principal_df); only 'Mortality Rate' is
# read from df_updated after this point — confirm the encoding is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [347]:
# Model inputs come from the PCA-transformed frame; the target is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [348]:
# Learn standardisation statistics (mean/std) from the training split only
scaler = StandardScaler().fit(X_train)
scaler
Out[348]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [349]:
# Apply scaling on the training set
# (uses the mean/std learned from the training split above)
X_train_scaled = scaler.transform(X_train)
In [350]:
# Apply scaling on the test set
# (re-uses the training-set statistics — the scaler is not refit on test data)
X_test_scaled = scaler.transform(X_test)
In [351]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over: tree depth and count control capacity,
# learning_rate/gamma control the fit dynamics, subsample/colsample_bytree
# control per-tree row/column subsampling.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [352]:
# Exhaustive grid search with 10-fold cross-validation (k = 10),
# parallelised across all available cores
grid_search = GridSearchCV(xgb_model, params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9967966965293827
In [353]:
# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refit on the full training set — the extra .fit() call previously here was
# redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [354]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the KL divergence after normalising both
# arguments into probability distributions; any zero in y_pred where
# y_test > 0 makes the divergence infinite (this cell previously printed
# "Entropy Value: inf"). Clip both arrays to a small positive floor so the
# reported value stays finite.
_eps = 1e-12
entropy_val = entropy(np.clip(y_test, _eps, None), np.clip(y_pred, _eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0008619570126196017
R2 Score: 0.9964724585076836
RMSE: 0.029359
Entropy Value: inf
In [355]:
# Rank the model inputs by the importance the tuned model assigned them
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[355]:
feature importance
1 human_development_index 0.429013
0 hospital_beds_per_thousand 0.351790
5 population 0.185761
2 extreme_poverty 0.022991
3 gdp_per_capita 0.005310
4 population_density 0.005134
In [356]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[356]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [357]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Extracting important features for XGBoost Model Analysis for the population health index,
# restricted to the two countries under comparison
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [358]:
# Inspect the filtered two-country frame (rich display of the last expression)
df_updated
Out[358]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 0.816005

2102 rows × 9 columns

In [359]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# df_updated is a row-filtered slice of the imported frame; take an explicit
# copy so the column assignments below cannot raise SettingWithCopyWarning or
# silently write to a view of the original frame.
df_updated = df_updated.copy()

# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [360]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (lags are undefined for each country's first day / week / month)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [361]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and the three lagged-mortality columns — the
# prediction target leaks into the components and will inflate downstream
# scores. Consider dropping those columns before fitting (the matching
# transform cell below must use the same column set).
# NOTE(review): PCA is fit on unscaled data here, so large-magnitude columns
# (e.g. population) will dominate the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[361]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [362]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all components, then keep the first 6 (ordered by explained
# variance). Must use the same column slice the PCA was fit on above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [363]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of
# ALL input columns — not the original features. Reusing the raw feature names
# makes the later feature-importance table read as a ranking of the original
# variables, which it is not. Names like 'PC1'..'PC6' would be accurate.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
# Rows of principal_components are positionally aligned with df_updated.
principal_df['location'] = df_updated['location'].values
In [364]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards — X is
# built from principal_df and y from 'Mortality Rate' — so the net effect of
# this step is only to remove the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [365]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X: the six principal components (rows align with df_updated by position);
# y: the raw daily mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for an autocorrelated
# daily time series this mixes future and past observations across the split
# and can inflate test scores — confirm a random (rather than temporal) split
# is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [366]:
# Fit scaling on the training set
# Fitting on X_train only (not the full X) keeps test-set statistics out of
# the scaler; the same fitted transform is applied to the test set below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[366]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [367]:
# Apply scaling on the training set
# Standardizes each component using the train-set mean/std fitted above.
X_train_scaled = scaler.transform(X_train)
In [368]:
# Apply scaling on the test set
# Reuses the train-set statistics — never refit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [369]:
# XGBoost regressor with default settings; the grid search below tunes it.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: tree depth, shrinkage, ensemble size, minimum split
# loss (gamma), and row/column subsampling rates (3*3*3*3*2*2 = 324 combos).
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [370]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds; n_jobs=-1 parallelizes across cores.
# The CV score shown is the estimator's default score (R^2 for regressors —
# TODO confirm against the sklearn version in use).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986502721479346
In [371]:
# Select the tuned model. GridSearchCV already refits best_estimator_ on the
# full training set when refit=True (the default), so the extra fit() call
# that was here was redundant duplicate work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (already scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [372]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two sequences after normalizing each to sum to 1 — it treats y_test and
# y_pred as probability distributions, not as regression errors. Confirm this
# is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012007379222393649
R2 Score: 0.997706010036352
RMSE: 0.109578
Entropy Value: 0.0007943736902363641
In [373]:
# Rank the six PCA inputs by the tuned model's importance scores.
# (The labels come from selected_cols; see the PCA cells above for what they
# actually represent.)
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[373]:
feature importance
1 diabetes_prevalence 0.552534
0 cardiovasc_death_rate 0.197404
5 aged_65_older 0.146890
3 male_smokers 0.064874
2 female_smokers 0.037203
4 life_expectancy 0.001096
In [374]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# prefer a configurable data directory (e.g. a DATA_DIR constant / pathlib).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame to sanity-check shape and columns.
df_updated
Out[374]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [375]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materializes the row-filtered subset so the lagged-column
# assignments in later cells write to an independent frame instead of a view
# of the original (avoids pandas' SettingWithCopyWarning / lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [376]:
# Sanity-check the two-country subset (row count and columns) before modeling.
df_updated
Out[376]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.50 46949.283 24.718 10549349 0.816005

2102 rows × 9 columns

In [377]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by 'location' keeps each country's series separate, so the lag never
# reads across the country boundary; the first 1/7/30 rows per country are NaN.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [378]:
# The earliest rows in each country have no history at the given lag, so the
# shifted columns start as NaN; treat that missing history as a rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [379]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and the three lagged-mortality columns — the
# prediction target leaks into the components and will inflate downstream
# scores. Consider dropping those columns before fitting (the matching
# transform cell below must use the same column set).
# NOTE(review): PCA is fit on unscaled data here, so large-magnitude columns
# (e.g. population) will dominate the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[379]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [380]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all components, then keep the first 6 (ordered by explained
# variance). Must use the same column slice the PCA was fit on above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [381]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of
# ALL input columns — not the original features. Reusing the raw feature names
# makes the later feature-importance table read as a ranking of the original
# variables, which it is not. Names like 'PC1'..'PC6' would be accurate.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Rows of principal_components are positionally aligned with df_updated.
principal_df['location'] = df_updated['location'].values
In [382]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards — X is
# built from principal_df and y from 'Mortality Rate' — so the net effect of
# this step is only to remove the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [383]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the six principal components (rows align with df_updated by position);
# y: the raw daily mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for an autocorrelated
# daily time series this mixes future and past observations across the split
# and can inflate test scores — confirm a random (rather than temporal) split
# is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [384]:
# Fit scaling on the training set
# Fitting on X_train only (not the full X) keeps test-set statistics out of
# the scaler; the same fitted transform is applied to the test set below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[384]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [385]:
# Apply scaling on the training set
# Standardizes each component using the train-set mean/std fitted above.
X_train_scaled = scaler.transform(X_train)
In [386]:
# Apply scaling on the test set
# Reuses the train-set statistics — never refit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [387]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# Grid of tree depth, shrinkage, ensemble size, minimum split loss (gamma),
# and row/column subsampling rates: 3*3*3*3*2*2 = 324 combinations.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [388]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds; n_jobs=-1 parallelizes across cores.
# The CV score shown is the estimator's default score (R^2 for regressors —
# TODO confirm against the sklearn version in use).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9985258910599774
In [389]:
# Select the tuned model. GridSearchCV already refits best_estimator_ on the
# full training set when refit=True (the default), so the extra fit() call
# that was here was redundant duplicate work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (already scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [390]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two sequences after normalizing each to sum to 1 — it treats y_test and
# y_pred as probability distributions, not as regression errors. Confirm this
# is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009780822718122402
R2 Score: 0.9981313899781104
RMSE: 0.098898
Entropy Value: 0.0007038834783452404
In [391]:
# Rank the six PCA inputs by the tuned model's importance scores.
# (The labels come from selected_cols; see the PCA cells above for what they
# actually represent.)
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[391]:
feature importance
1 human_development_index 0.543966
5 population 0.270168
0 hospital_beds_per_thousand 0.096949
3 gdp_per_capita 0.051447
2 extreme_poverty 0.037239
4 population_density 0.000231
In [392]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# prefer a configurable data directory (e.g. a DATA_DIR constant / pathlib).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame to sanity-check shape and columns.
df_updated
Out[392]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [393]:
country1 = 'Cyprus'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
# .copy() materializes the row-filtered subset so the lagged-column
# assignments in later cells write to an independent frame instead of a view
# of the original (avoids pandas' SettingWithCopyWarning / lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [394]:
# Sanity-check the two-country subset (row count and columns) before modeling.
df_updated
Out[394]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 0.11011

2063 rows × 9 columns

In [395]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series within each country by 1 day, 1 week, and ~1 month.
# Grouping by 'location' keeps each country's series separate, so a lag never
# reads across the country boundary; the first 1/7/30 rows per country are NaN.
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag_days)
In [396]:
# The earliest rows in each country have no history at the given lag, so the
# shifted columns start as NaN; treat that missing history as a rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [397]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and the three lagged-mortality columns — the
# prediction target leaks into the components and will inflate downstream
# scores. Consider dropping those columns before fitting (the matching
# transform cell below must use the same column set).
# NOTE(review): PCA is fit on unscaled data here, so large-magnitude columns
# will dominate the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[397]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [398]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all components, then keep the first 6 (ordered by explained
# variance). Must use the same column slice the PCA was fit on above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [399]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of
# ALL input columns — not the original features. Reusing the raw feature names
# makes the later feature-importance table read as a ranking of the original
# variables, which it is not. Names like 'PC1'..'PC6' would be accurate.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
# Rows of principal_components are positionally aligned with df_updated.
principal_df['location'] = df_updated['location'].values
In [400]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards — X is
# built from principal_df and y from 'Mortality Rate' — so the net effect of
# this step is only to remove the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [401]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
# X: the six principal components (rows align with df_updated by position);
# y: the raw daily mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for an autocorrelated
# daily time series this mixes future and past observations across the split
# and can inflate test scores — confirm a random (rather than temporal) split
# is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [402]:
# Fit scaling on the training set
# Fitting on X_train only (not the full X) keeps test-set statistics out of
# the scaler; the same fitted transform is applied to the test set below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[402]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [403]:
# Apply scaling on the training set
# Standardizes each component using the train-set mean/std fitted above.
X_train_scaled = scaler.transform(X_train)
In [404]:
# Apply scaling on the test set
# Reuses the train-set statistics — never refit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [405]:
# XGBoost regressor with default settings; the grid search below tunes it.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: tree depth, shrinkage, ensemble size, minimum split
# loss (gamma), and row/column subsampling rates (3*3*3*3*2*2 = 324 combos).
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [406]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds; n_jobs=-1 parallelizes across cores.
# The CV score shown is the estimator's default score (R^2 for regressors —
# TODO confirm against the sklearn version in use).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.988459659933665
In [407]:
# Select the tuned model. GridSearchCV already refits best_estimator_ on the
# full training set when refit=True (the default), so the extra fit() call
# that was here was redundant duplicate work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (already scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [408]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two sequences after normalizing each to sum to 1 — it treats y_test and
# y_pred as probability distributions, not as regression errors. Confirm this
# is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0004198357264663966
R2 Score: 0.9978595292846001
RMSE: 0.020490
Entropy Value: 0.0008321936994172394
In [409]:
# Rank the six PCA inputs by the tuned model's importance scores.
# (The labels come from selected_cols; see the PCA cells above for what they
# actually represent.)
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[409]:
feature importance
1 diabetes_prevalence 0.622458
0 cardiovasc_death_rate 0.235917
5 aged_65_older 0.061143
3 male_smokers 0.032189
2 female_smokers 0.026247
4 life_expectancy 0.022046
In [410]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — breaks on any other machine;
# prefer a configurable data directory (e.g. a DATA_DIR constant / pathlib).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Display the loaded frame to sanity-check shape and columns.
df_updated
Out[410]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [411]:
country1 = 'Cyprus'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materializes the row-filtered subset so the lagged-column
# assignments in later cells write to an independent frame instead of a view
# of the original (avoids pandas' SettingWithCopyWarning / lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [412]:
# Sanity-check the two-country subset (row count and columns) before modeling.
df_updated
Out[412]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 32415.132 127.657 896007 0.00000
... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.20 46482.958 3.404 372903 0.11011

2063 rows × 9 columns

In [413]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by 'location' keeps each country's series separate, so the lag never
# reads across the country boundary; the first 1/7/30 rows per country are NaN.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [414]:
# The earliest rows in each country have no history at the given lag, so the
# shifted columns start as NaN; treat that missing history as a rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [415]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA input
# still contains 'Mortality Rate' and the three lagged-mortality columns — the
# prediction target leaks into the components and will inflate downstream
# scores. Consider dropping those columns before fitting (the matching
# transform cell below must use the same column set).
# NOTE(review): PCA is fit on unscaled data here, so large-magnitude columns
# (e.g. population) will dominate the components — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[415]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [416]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # of input variables for XGBoost Model Analysis
# Project onto all components, then keep the first 6 (ordered by explained
# variance). Must use the same column slice the PCA was fit on above.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [417]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear combinations of
# ALL input columns — not the original features. Reusing the raw feature names
# makes the later feature-importance table read as a ranking of the original
# variables, which it is not. Names like 'PC1'..'PC6' would be accurate.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
# Rows of principal_components are positionally aligned with df_updated.
principal_df['location'] = df_updated['location'].values
In [418]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards — X is
# built from principal_df and y from 'Mortality Rate' — so the net effect of
# this step is only to remove the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [419]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X: the six principal components (rows align with df_updated by position);
# y: the raw daily mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for an autocorrelated
# daily time series this mixes future and past observations across the split
# and can inflate test scores — confirm a random (rather than temporal) split
# is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [420]:
# Fit scaling on the training set
# Fitting on X_train only (not the full X) keeps test-set statistics out of
# the scaler; the same fitted transform is applied to the test set below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[420]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [421]:
# Apply scaling on the training set
# Standardizes each component using the train-set mean/std fitted above.
X_train_scaled = scaler.transform(X_train)
In [422]:
# Apply scaling on the test set
# Reuses the train-set statistics — never refit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [423]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# Grid of tree depth, shrinkage, ensemble size, minimum split loss (gamma),
# and row/column subsampling rates: 3*3*3*3*2*2 = 324 combinations.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [424]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds; n_jobs=-1 parallelizes across cores.
# The CV score shown is the estimator's default score (R^2 for regressors —
# TODO confirm against the sklearn version in use).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.988544640680141
In [425]:
# Select the tuned model. GridSearchCV already refits best_estimator_ on the
# full training set when refit=True (the default), so the extra fit() call
# that was here was redundant duplicate work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (already scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [426]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence between
# the two sequences after normalizing each to sum to 1 — it treats y_test and
# y_pred as probability distributions, not as regression errors. Confirm this
# is the intended "entropy" metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0009590270117916058
R2 Score: 0.9951105418033505
RMSE: 0.030968
Entropy Value: 0.0017192056737414868
In [427]:
# Rank the six PCA inputs by the tuned model's importance scores.
# (The labels come from selected_cols; see the PCA cells above for what they
# actually represent.)
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[427]:
feature importance
1 human_development_index 0.542660
0 hospital_beds_per_thousand 0.171023
2 extreme_poverty 0.106136
3 gdp_per_capita 0.073260
4 population_density 0.055552
5 population 0.051370
In [428]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[428]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [49]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
# .copy() materialises the two-country subset so later cells can add new
# columns without pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [50]:
df_updated
Out[50]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 14.312 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.30 13.928 0.491388

2076 rows × 9 columns

In [51]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality variables (1 day, 7 days, 30 days back) per country.
# assign() builds all three columns in one step on a fresh frame, avoiding
# chained assignment on a filtered slice (SettingWithCopyWarning).
# NOTE(review): shift() assumes rows are date-sorted within each location —
# TODO confirm the upstream ordering.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [52]:
# The lagged columns are NaN at the start of each country's series (no history
# yet); treat those missing lags as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [53]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged mortality
# columns, so target information leaks into the components; PCA is also fit on
# unscaled features — TODO confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[53]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [54]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [55]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original
# features — the feature names are labels only and can mislead the
# feature-importance readout downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [56]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [57]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a time series mixes past and future
# observations — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [58]:
# Fit scaling on the training set
# NOTE(review): scaling is applied after PCA here; features are conventionally
# standardized before PCA — TODO confirm the ordering.
scaler = StandardScaler()
scaler.fit(X_train)
Out[58]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [59]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [60]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [61]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [62]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3,240
# model fits; scoring defaults to the estimator's own score method
# (R^2 for sklearn-API regressors — confirm if another metric was intended).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987342751999154
In [63]:
# GridSearchCV is constructed with the default refit=True, so best_estimator_
# has already been refit on the full training set — an extra fit() call here
# would only repeat identical training work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [64]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric, and zero entries in y_test drop out of the sum; confirm
# this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0012823484938275537
R2 Score: 0.9994384293418184
RMSE: 0.035810
Entropy Value: 0.000229301840245004
In [65]:
# Rank the PCA-derived inputs by the importance XGBoost learned for them.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[65]:
feature importance
5 aged_65_older 0.747254
0 cardiovasc_death_rate 0.176164
1 diabetes_prevalence 0.054523
2 female_smokers 0.012670
4 life_expectancy 0.005541
3 male_smokers 0.003848
In [66]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[66]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [67]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materialises the two-country subset so later cells can add new
# columns without pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [68]:
df_updated
Out[68]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 67335.293 69.874 5023108 0.491388

2076 rows × 9 columns

In [69]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality variables (1 day, 7 days, 30 days back) per country.
# assign() builds all three columns in one step on a fresh frame, avoiding
# chained assignment on a filtered slice (SettingWithCopyWarning).
# NOTE(review): shift() assumes rows are date-sorted within each location —
# TODO confirm the upstream ordering.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [70]:
# The lagged columns are NaN at the start of each country's series (no history
# yet); treat those missing lags as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [71]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged mortality
# columns, so target information leaks into the components; PCA is also fit on
# unscaled features — TODO confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[71]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [72]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [73]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original
# features — the feature names are labels only and can mislead the
# feature-importance readout downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [74]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [75]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a time series mixes past and future
# observations — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [76]:
# Fit scaling on the training set
# NOTE(review): scaling is applied after PCA here; features are conventionally
# standardized before PCA — TODO confirm the ordering.
scaler = StandardScaler()
scaler.fit(X_train)
Out[76]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [77]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [78]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [79]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [80]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3,240
# model fits; scoring defaults to the estimator's own score method
# (R^2 for sklearn-API regressors — confirm if another metric was intended).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986051830769147
In [81]:
# GridSearchCV is constructed with the default refit=True, so best_estimator_
# has already been refit on the full training set — an extra fit() call here
# would only repeat identical training work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [82]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric, and zero entries in y_test drop out of the sum; confirm
# this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015122598797065847
R2 Score: 0.9993377457219498
RMSE: 0.038888
Entropy Value: 0.0003022118010805689
In [83]:
# Rank the PCA-derived inputs by the importance XGBoost learned for them.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[83]:
feature importance
5 population 0.715384
1 human_development_index 0.205967
0 hospital_beds_per_thousand 0.052834
2 extreme_poverty 0.024062
3 gdp_per_capita 0.001458
4 population_density 0.000295
In [464]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[464]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [54]:
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'Mortality Rate']]
# .copy() materialises the two-country subset so later cells can add new
# columns without pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [55]:
df_updated
Out[55]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 22.222222
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 1.084791

2136 rows × 9 columns

In [56]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality variables (1 day, 7 days, 30 days back) per country.
# assign() builds all three columns in one step on a fresh frame, avoiding
# chained assignment on a filtered slice (SettingWithCopyWarning).
# NOTE(review): shift() assumes rows are date-sorted within each location —
# TODO confirm the upstream ordering.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [57]:
# The lagged columns are NaN at the start of each country's series (no history
# yet); treat those missing lags as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [58]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged mortality
# columns, so target information leaks into the components; PCA is also fit on
# unscaled features — TODO confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[58]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [59]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 6  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [60]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original
# features — the feature names are labels only and can mislead the
# feature-importance readout downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older'])
principal_df['location'] = df_updated['location'].values
In [61]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [62]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a time series mixes past and future
# observations — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [63]:
# Fit scaling on the training set
# NOTE(review): scaling is applied after PCA here; features are conventionally
# standardized before PCA — TODO confirm the ordering.
scaler = StandardScaler()
scaler.fit(X_train)
Out[63]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [64]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [65]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [66]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for the grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [67]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3,240
# model fits; scoring defaults to the estimator's own score method
# (R^2 for sklearn-API regressors — confirm if another metric was intended).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9552082375777268
In [68]:
# GridSearchCV is constructed with the default refit=True, so best_estimator_
# has already been refit on the full training set — an extra fit() call here
# would only repeat identical training work.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [69]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability
# distributions and returns their KL divergence — it is not a standard
# regression metric, and zero entries in y_test drop out of the sum; confirm
# this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  2.003468587162617
R2 Score: 0.9173883860485977
RMSE: 1.415439
Entropy Value: 0.008328296781842388
In [70]:
# Rank the PCA-derived inputs by the importance XGBoost learned for them.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[70]:
feature importance
0 cardiovasc_death_rate 0.466369
1 diabetes_prevalence 0.261525
5 aged_65_older 0.146323
2 female_smokers 0.114804
4 life_expectancy 0.007744
3 male_smokers 0.003235
In [71]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook is reproducible on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[71]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [72]:
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() materialises the two-country subset so later cells can add new
# columns without pandas' SettingWithCopyWarning on a filtered view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [73]:
df_updated
Out[73]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 9 columns

In [74]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged mortality variables (1 day, 7 days, 30 days back) per country.
# assign() builds all three columns in one step on a fresh frame, avoiding
# chained assignment on a filtered slice (SettingWithCopyWarning).
# NOTE(review): shift() assumes rows are date-sorted within each location —
# TODO confirm the upstream ordering.
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
df_updated = df_updated.assign(
    prev_day_mortality=mortality_by_country.shift(1),
    prev_week_mortality=mortality_by_country.shift(7),
    prev_month_mortality=mortality_by_country.shift(30),
)
In [75]:
# The lagged columns are NaN at the start of each country's series (no history
# yet); treat those missing lags as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [76]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged mortality
# columns, so target information leaks into the components; PCA is also fit on
# unscaled features — TODO confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[76]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [77]:
# Setting the number of principal components to 6 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 6  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [78]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal components, not the original
# features — the feature names are labels only and can mislead the
# feature-importance readout downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [79]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used downstream.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [80]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a time series mixes past and future
# observations — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [81]:
# Fit scaling on the training set
# NOTE(review): scaling is applied after PCA here; features are conventionally
# standardized before PCA — TODO confirm the ordering.
scaler = StandardScaler()
scaler.fit(X_train)
Out[81]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [82]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [83]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [84]:
# Define XGBoost model (regression; library-default objective and seed)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [85]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; the default KFold is not time-aware,
# so CV folds mix dates — NOTE(review): confirm this is acceptable for a
# time-series target.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9566213518011253
In [86]:
# Fit the model using the best hyperparameters
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training set, so this .fit() is redundant
# (it refits on the same data).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [87]:
# Evaluate the XGBoost model: MSE, RMSE (same units as the target), R^2, and a
# KL-divergence-based "entropy" between the true and predicted distributions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalises both arguments and computes the KL
# divergence D(pk || qk); it returns inf as soon as a prediction is <= 0 where
# the true value is positive (this produced "Entropy Value: inf" elsewhere in
# this notebook). Flooring both arrays at a tiny positive value keeps the
# metric finite without materially changing it for well-behaved inputs.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.713063858863409
R2 Score: 0.9705973147727024
RMSE: 0.844431
Entropy Value: 0.008268179051644176
In [88]:
# Tabulate the model's feature importances, largest first; the frame is the
# cell's last expression, so it renders below.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[88]:
feature importance
0 hospital_beds_per_thousand 0.302760
1 human_development_index 0.227340
5 population 0.207927
4 population_density 0.146404
2 extreme_poverty 0.077408
3 gdp_per_capita 0.038161
In [3]:
# Country Pair by Pair Analysis relative to population density
In [4]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute user-specific path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[4]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [5]:
# Showing the pairings of countries based on population density (13 pairs of countries)
def select_country(frame, name):
    """Return the rows of `frame` whose `location` column equals `name`."""
    return frame[frame.location == name]

df_Bulgaria = select_country(df, "Bulgaria")
df_Canada = select_country(df, "Canada")

df_Estonia = select_country(df, "Estonia")
df_Finland = select_country(df, "Finland")

df_Iceland = select_country(df, "Iceland")
df_Ireland = select_country(df, "Ireland")

df_Latvia = select_country(df, "Latvia")
df_Romania = select_country(df, "Romania")

df_Serbia = select_country(df, "Serbia")
df_Spain = select_country(df, "Spain")

df_Sweden = select_country(df, "Sweden")
df_UnitedStates = select_country(df, "United States")

df_Austria = select_country(df, "Austria")
df_Cyprus = select_country(df, "Cyprus")

df_Czechia = select_country(df, "Czechia")
df_Denmark = select_country(df, "Denmark")

df_France = select_country(df, "France")
df_Portugal = select_country(df, "Portugal")

df_Slovakia = select_country(df, "Slovakia")
df_Slovenia = select_country(df, "Slovenia")

df_Belgium = select_country(df, "Belgium")
df_Italy = select_country(df, "Italy")

df_Luxembourg = select_country(df, "Luxembourg")
df_Netherlands = select_country(df, "Netherlands")

df_Switzerland = select_country(df, "Switzerland")
df_UnitedKingdom = select_country(df, "United Kingdom")
In [6]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [7]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): written to the current working directory (and with the index
# as an extra unnamed column), but later cells read this file from
# C:/Users/marco/Downloads — confirm the file is moved/copied, and consider
# to_csv(..., index=False).
dataframe_one.to_csv("dataframe-one.csv")
In [8]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute path, and the CSV was written with an index
# column — confirm whether index_col=0 is needed here.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[8]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [9]:
# First country pair under analysis (matched on population density).
country1 = 'Bulgaria'
country2 = 'Canada'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the slice from the parent frame so that the lagged-column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning
# (and keep working under copy-on-write semantics).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [10]:
df_updated
Out[10]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2099 rows × 10 columns

In [11]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): df_updated is a filtered slice of a larger frame; assigning new
# columns onto it can raise SettingWithCopyWarning unless the slice was copied
# — confirm an explicit .copy() was taken upstream. Shifts are per-location so
# lags never cross country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [12]:
# Replace the NaNs introduced by the shifts (first day / week / month of each
# country's series) with 0 in every lagged column.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [13]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows before the train/test split (test
# information leaks into the components) and on unstandardised columns, so
# large-scale features dominate the components — confirm this ordering is
# intended (usual practice: split, scale, then fit PCA on train only).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[13]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [14]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): slicing keeps the first n components, ordered by explained variance.
n_components = 7  # number of retained components = number of XGBoost input variables
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [15]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of
# ALL inputs), not the original variables — reusing the raw feature names makes
# the downstream "feature importance" table misleading; consider PC1..PCn.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [16]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df); this transform appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [17]:
# X holds the retained principal components (labelled with raw feature names);
# y is the raw mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows with near-constant
# country features inflates R^2 — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [18]:
# Fit scaling on the training set only (avoids scaling-stage test leakage);
# the fitted object is the cell's last expression, hence the repr output below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[18]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [19]:
# Apply scaling on the training set (uses the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [20]:
# Apply scaling on the test set with the SAME train-fitted scaler (correct — no refit)
X_test_scaled = scaler.transform(X_test)
In [21]:
# Define XGBoost model (regression; library-default objective and seed)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [22]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; the default KFold is not time-aware.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9773211046544897
In [23]:
# Fit the model using the best hyperparameters
# NOTE(review): with refit=True (the default) best_estimator_ is already fitted
# on the full training set, so this .fit() is redundant (harmless refit).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [24]:
# Evaluate the XGBoost model: MSE, RMSE (same units as the target), R^2, and a
# KL-divergence-based "entropy" between the true and predicted distributions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalises both arguments and computes the KL
# divergence D(pk || qk); it returns inf whenever a prediction is <= 0 where
# the true value is positive (this produced "Entropy Value: inf" elsewhere in
# this notebook). Flooring both arrays at a tiny positive value keeps the
# metric finite without materially changing it for well-behaved inputs.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004743237035315944
R2 Score: 0.9984501610588699
RMSE: 0.068871
Entropy Value: 0.0003533082890357041
In [25]:
# Tabulate the model's feature importances, largest first; the frame is the
# cell's last expression, so it renders below.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[25]:
feature importance
1 diabetes_prevalence 0.565595
0 cardiovasc_death_rate 0.248605
5 aged_65_older 0.120960
2 female_smokers 0.026608
4 life_expectancy 0.015588
3 male_smokers 0.014215
6 median_age 0.008429
In [26]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute path; the file was written earlier with an
# index column — confirm whether index_col=0 is needed here.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[26]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [27]:
# Same country pair, now with the country-level (health-system) feature set.
country1 = 'Bulgaria'
country2 = 'Canada'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# .copy() detaches the slice from the parent frame so that the lagged-column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning
# (and keep working under copy-on-write semantics).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [28]:
df_updated
Out[28]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 18563.307 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 18563.307 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 18563.307 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 18563.307 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 18563.307 6781955 14.285714
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.500 0.929 0.5 44017.591 38454328 1.092509
15717 Canada 12/26/2022 2.500 0.929 0.5 44017.591 38454328 1.092338
15718 Canada 12/27/2022 2.500 0.929 0.5 44017.591 38454328 1.092196
15719 Canada 12/28/2022 2.500 0.929 0.5 44017.591 38454328 1.092321
15720 Canada 12/29/2022 2.500 0.929 0.5 44017.591 38454328 1.093162

2099 rows × 8 columns

In [29]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): df_updated is a filtered slice of a larger frame; assigning new
# columns onto it can raise SettingWithCopyWarning unless the slice was copied
# — confirm an explicit .copy() was taken upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [30]:
# Replace the NaNs introduced by the shifts (first day / week / month of each
# country's series) with 0 in every lagged column.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [31]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows before the train/test split (test
# information leaks into the components) and on unstandardised columns, so
# large-scale features (e.g. population) dominate — confirm intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[31]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [32]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): slicing keeps the first n components, ordered by explained variance.
n_components = 5  # number of retained components = number of XGBoost input variables
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [33]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — the raw feature names here are misleading; consider PC1..PCn.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [34]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df); this transform appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [35]:
# X holds the retained principal components (labelled with raw feature names);
# y is the raw mortality-rate target.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows with near-constant
# country features inflates R^2 — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [36]:
# Fit scaling on the training set only (avoids scaling-stage test leakage);
# the fitted object is the cell's last expression, hence the repr output below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[36]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [37]:
# Apply scaling on the training set (uses the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [38]:
# Apply scaling on the test set with the SAME train-fitted scaler (correct — no refit)
X_test_scaled = scaler.transform(X_test)
In [39]:
# Define XGBoost model (regression; library-default objective and seed)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [40]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; the default KFold is not time-aware.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9762845281500339
In [41]:
# Fit the model using the best hyperparameters
# NOTE(review): with refit=True (the default) best_estimator_ is already fitted
# on the full training set, so this .fit() is redundant (harmless refit).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [42]:
# Evaluate the XGBoost model: MSE, RMSE (same units as the target), R^2, and a
# KL-divergence-based "entropy" between the true and predicted distributions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalises both arguments and computes the KL
# divergence D(pk || qk); it returns inf whenever a prediction is <= 0 where
# the true value is positive — that is why this cell printed
# "Entropy Value: inf". Flooring both arrays at a tiny positive value keeps
# the metric finite without materially changing it for well-behaved inputs.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00798631766145176
R2 Score: 0.9973904938724765
RMSE: 0.089366
Entropy Value: inf
In [43]:
# Tabulate the model's feature importances, largest first; the frame is the
# cell's last expression, so it renders below.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[43]:
feature importance
1 human_development_index 0.477353
0 hospital_beds_per_thousand 0.366962
4 population 0.082307
2 extreme_poverty 0.054872
3 gdp_per_capita 0.018506
In [44]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute path; the file was written earlier with an
# index column — confirm whether index_col=0 is needed here.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[44]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [45]:
# Next country pair (matched on population density), population health index features.
country1 = 'Estonia'
country2 = 'Finland'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the slice from the parent frame so that the lagged-column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning
# (and keep working under copy-on-write semantics).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [46]:
df_updated
Out[46]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
... ... ... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8372 Finland 12/26/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8373 Finland 12/27/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8374 Finland 12/28/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159
8375 Finland 12/29/2022 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.55159

2127 rows × 10 columns

In [47]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): df_updated is a filtered slice of a larger frame; assigning new
# columns onto it can raise SettingWithCopyWarning unless the slice was copied
# — confirm an explicit .copy() was taken upstream.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [48]:
# Replace the NaNs introduced by the shifts (first day / week / month of each
# country's series) with 0 in every lagged column.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [49]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows before the train/test split (test
# information leaks into the components) and on unstandardised columns —
# confirm this ordering is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[49]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [50]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): slicing keeps the first n components, ordered by explained variance.
n_components = 7  # number of retained components = number of XGBoost input variables
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [51]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# variables — the raw feature names here are misleading; consider PC1..PCn.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [52]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df); this transform appears to be dead code.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [53]:
# X holds the retained principal components (labelled with raw feature names);
# y is the raw mortality-rate target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of daily time-series rows with near-constant
# country features inflates R^2 — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [54]:
# Fit scaling on the training set only (avoids scaling-stage test leakage);
# the fitted object is the cell's last expression, hence the repr output below.
scaler = StandardScaler()
scaler.fit(X_train)
Out[54]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [55]:
# Apply scaling on the training set (uses the train-fitted mean/std)
X_train_scaled = scaler.transform(X_train)
In [56]:
# Apply scaling on the test set with the SAME train-fitted scaler (correct — no refit)
X_test_scaled = scaler.transform(X_test)
In [57]:
# Define XGBoost model (regression; library-default objective and seed)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [58]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; the default KFold is not time-aware.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.996967666861275
In [59]:
# Fit the model using the best hyperparameters
# NOTE(review): with refit=True (the default) best_estimator_ is already fitted
# on the full training set, so this .fit() is redundant (harmless refit).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [60]:
# Evaluate the XGBoost model: MSE, RMSE (same units as the target), R^2, and a
# KL-divergence-based "entropy" between the true and predicted distributions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalises both arguments and computes the KL
# divergence D(pk || qk); it returns inf whenever a prediction is <= 0 where
# the true value is positive — that is why this cell printed
# "Entropy Value: inf". Flooring both arrays at a tiny positive value keeps
# the metric finite without materially changing it for well-behaved inputs.
entropy_val = entropy(np.clip(y_test, 1e-12, None), np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002243279651619481
R2 Score: 0.9979993463101589
RMSE: 0.047363
Entropy Value: inf
In [61]:
# Tabulate the model's feature importances, largest first; the frame is the
# cell's last expression, so it renders below.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[61]:
feature importance
1 diabetes_prevalence 0.911358
5 aged_65_older 0.038886
0 cardiovasc_death_rate 0.022946
2 female_smokers 0.011241
6 median_age 0.008843
3 male_smokers 0.004015
4 life_expectancy 0.002711
In [62]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[62]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [63]:
# Country pair analysed in this run of the pipeline
country1 = 'Estonia'
country2 = 'Finland'

# Keep only the country-health-index features (plus identifiers and the target),
# restricted to the two countries under comparison.
feature_columns = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
in_pair = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_pair, feature_columns]
In [64]:
df_updated
Out[64]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
6250 Estonia 1/18/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
6251 Estonia 2/5/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
6252 Estonia 2/6/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
6253 Estonia 2/7/2020 4.69 0.892 0.50 29481.252 1326064 0.00000
... ... ... ... ... ... ... ... ...
8371 Finland 12/25/2022 3.28 0.938 0.04 40585.721 5540745 0.55159
8372 Finland 12/26/2022 3.28 0.938 0.04 40585.721 5540745 0.55159
8373 Finland 12/27/2022 3.28 0.938 0.04 40585.721 5540745 0.55159
8374 Finland 12/28/2022 3.28 0.938 0.04 40585.721 5540745 0.55159
8375 Finland 12/29/2022 3.28 0.938 0.04 40585.721 5540745 0.55159

2127 rows × 8 columns

In [65]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift within one country, so the first rows of the
# second country do not inherit the first country's trailing mortality values.
# NOTE(review): df_updated is a filtered slice of the original frame; assigning new
# columns to it can raise pandas' SettingWithCopyWarning — take a .copy() after filtering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [66]:
# The lag columns start with NaN wherever no prior observation exists
# (first day / first week / first month per country); treat those as zero mortality.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for lag_column in lag_columns:
    df_updated[lag_column] = df_updated[lag_column].fillna(0)
In [67]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and the three lag columns, so the
# target itself is folded into the principal components that later serve as model
# inputs — target leakage that likely explains the near-perfect R^2 scores below.
# PCA is also fit on unscaled data and on the full frame before the train/test split
# (test-set leakage).  Consider fitting PCA on predictor columns only, after
# standardisation, on the training split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[67]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [68]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [69]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column names are misleading — each column is a principal
# component (a linear mixture of all inputs), not the original indicator it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [70]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from principal_df),
# so this transformation has no effect on the model inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [71]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# X (the PCA scores) and y align by row position; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [72]:
# Fit scaling on the training set
# Standardise the PCA scores; fit on the training split only to avoid test-set leakage.
scaler = StandardScaler()
scaler.fit(X_train)
Out[72]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [73]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [74]:
# Apply scaling on the test set
# Reuse the training-set scaler; the test set is transformed, never re-fit.
X_test_scaled = scaler.transform(X_test)
In [75]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# Same 324-combination grid as the earlier analysis.
# NOTE(review): this whole scale/tune/fit/evaluate pipeline is copy-pasted for each
# country pair — extracting it into a parameterised function would prevent drift.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [76]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Scored with the regressor's default R^2; n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9961922448004852
In [77]:
# Fit the model using the best hyperparameters
# (redundant with GridSearchCV's default refit=True, but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [78]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence of the normalised
# arrays, not an entropy of the errors; it returns inf whenever y_test contains a
# zero paired with a non-zero prediction.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002853827219619632
R2 Score: 0.9974548336169414
RMSE: 0.053421
Entropy Value: 0.0013584415315992247
In [79]:
# Rank the model inputs by XGBoost's importance scores, highest first.
feature_importances = best_model.feature_importances_
# The model was trained on principal components, not on the raw indicators, so the
# importances are labelled by component here.  Labelling them with the original
# column names (selected_cols) wrongly implied that a single raw indicator such as
# 'human_development_index' carried ~89% of the importance.
feature_importances = pd.DataFrame({'feature': ['PC%d' % (i + 1) for i in range(len(selected_cols))],
                                    'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[79]:
feature importance
1 human_development_index 0.892065
2 extreme_poverty 0.046066
0 hospital_beds_per_thousand 0.034299
3 gdp_per_capita 0.022410
4 population 0.005159
In [80]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path; the re-read is needed only because
# the previous pair's analysis overwrote df_updated in place — keeping an untouched
# raw frame would avoid repeated disk reads.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[80]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [81]:
country1 = 'Iceland'
country2 = 'Ireland'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [82]:
# Display the filtered two-country (Iceland/Ireland) frame
df_updated
Out[82]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
18839 Ireland 3/1/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
18840 Ireland 3/2/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
18841 Ireland 3/3/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
18842 Ireland 3/4/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2071 rows × 10 columns

In [83]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift within one country so lag values never cross
# the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [84]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [85]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and the lag columns, so the
# target leaks into the components later used as model inputs; PCA is also fit on
# unscaled data and before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[85]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [86]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [87]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column is a principal component, not the raw indicator it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [88]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used below; X comes from principal_df)
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [89]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [90]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[90]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [91]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [92]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [93]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): identical grid repeated for each country pair — a parameterised
# helper function would remove the repetition and keep all pairs consistent.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [94]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990289844187075
In [95]:
# Fit the model using the best hyperparameters
# (redundant with GridSearchCV's default refit=True, but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [96]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): entropy(y_test, y_pred) is a KL divergence of the normalised arrays
# and returns inf when y_test contains zeros alongside non-zero predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0016794259329685336
R2 Score: 0.9993768015022259
RMSE: 0.040981
Entropy Value: 0.00039991375165087696
In [97]:
# NOTE(review): the importances below belong to principal components, but they are
# labelled with the original column names — misleading when interpreting which raw
# indicator "predicts" mortality.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[97]:
feature importance
1 diabetes_prevalence 0.656345
5 aged_65_older 0.175855
0 cardiovasc_death_rate 0.130769
6 median_age 0.021844
2 female_smokers 0.009989
4 life_expectancy 0.004495
3 male_smokers 0.000703
In [98]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path; re-read only to undo the previous
# pair's in-place filtering of df_updated.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[98]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [99]:
country1 = 'Iceland'
country2 = 'Ireland'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [100]:
# Display the filtered two-country (Iceland/Ireland) frame
df_updated
Out[100]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
18839 Ireland 3/1/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
18840 Ireland 3/2/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
18841 Ireland 3/3/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
18842 Ireland 3/4/2020 2.96 0.955 0.2 67335.293 5023108 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 0.2 46482.958 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 0.2 46482.958 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 0.2 46482.958 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 0.2 46482.958 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 0.2 46482.958 372903 0.11011

2071 rows × 8 columns

In [101]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift within one country so lag values never cross
# the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [102]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [103]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and the lag columns, so the
# target leaks into the components later used as model inputs; PCA is also fit on
# unscaled data and before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[103]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [104]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [105]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column is a principal component, not the raw indicator it is named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [106]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# (the dummy columns are not used below; X comes from principal_df)
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [107]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [108]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[108]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [109]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [110]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [111]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): identical grid repeated for each country pair — a parameterised
# helper function would remove the repetition and keep all pairs consistent.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [112]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9976029122392246
In [113]:
# Fit the model using the best hyperparameters
# (redundant with GridSearchCV's default refit=True, but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [114]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): entropy(y_test, y_pred) is a KL divergence of the normalised arrays
# and returns inf when y_test contains zeros alongside non-zero predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005769156505449668
R2 Score: 0.9978591912884988
RMSE: 0.075955
Entropy Value: 0.002695782621398591
In [115]:
# NOTE(review): the importances below belong to principal components, but they are
# labelled with the original column names — misleading when interpreting which raw
# indicator "predicts" mortality.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[115]:
feature importance
1 human_development_index 0.588998
0 hospital_beds_per_thousand 0.268803
2 extreme_poverty 0.050567
3 gdp_per_capita 0.049935
4 population 0.041696
In [116]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path; re-read only to undo the previous
# pair's in-place filtering of df_updated.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[116]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [117]:
country1 = 'Latvia'
country2 = 'Romania'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [118]:
# Display the filtered two-country (Latvia/Romania) frame
df_updated
Out[118]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
17800 Romania 2/26/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17801 Romania 2/27/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17802 Romania 2/28/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17803 Romania 2/29/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17804 Romania 3/1/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2076 rows × 10 columns

In [119]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift within one country so lag values never cross
# the country boundary.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [120]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [121]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and the lag columns, so the
# target leaks into the components later used as model inputs; PCA is also fit on
# unscaled data and before the train/test split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[121]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [122]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [123]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [124]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below (X is built
# from principal_df), so this call only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [125]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first seven principal components (labelled with original feature
# names); y is the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a daily time series, putting
# highly autocorrelated neighbouring days in both train and test; prefer a
# chronological split for an honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [126]:
# Fit scaling on the training set
# Standardiser statistics come from training rows only (no test leakage here).
scaler = StandardScaler()
scaler.fit(X_train)
Out[126]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [127]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [128]:
# Apply scaling on the test set (re-using the training-set mean/std)
X_test_scaled = scaler.transform(X_test)
In [129]:
# Base XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()

# Search space for the grid search: tree depth, shrinkage, ensemble size,
# minimum split loss, and row/column subsampling rates.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [130]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds are random slices of a shuffled time series, so
# fold scores inherit the same look-ahead leakage as the random train/test split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984971016962156
In [131]:
# GridSearchCV (with the default refit=True) already refits the best parameter
# combination on the full training set, so best_estimator_ is a trained model —
# the explicit second fit() call here was redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [132]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between two
# *normalised distributions*, not a regression metric; any zero in y_test makes
# it return inf (as the recorded output below shows).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0022732963920699734
R2 Score: 0.9984516084013416
RMSE: 0.047679
Entropy Value: inf
In [133]:
# NOTE(review): these importances describe the principal components, which were
# only *named* after original variables in principal_df — they are not the
# importances of the original variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[133]:
feature importance
1 diabetes_prevalence 0.373051
0 cardiovasc_death_rate 0.370197
6 median_age 0.174235
5 aged_65_older 0.065719
2 female_smokers 0.014828
3 male_smokers 0.001702
4 life_expectancy 0.000267
In [134]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — a pathlib-based DATA_DIR
# constant would make the notebook runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Echo the frame for a quick visual sanity check
df_updated
Out[134]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [135]:
# Pair of countries compared in this country-health-index run.
country1 = 'Latvia'
country2 = 'Romania'

# Restrict the frame to the country-health-index features (plus identifiers and
# the target) for the two selected countries, in a single .loc selection.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [136]:
# Inspect the filtered two-country frame before feature engineering
df_updated
Out[136]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
17800 Romania 2/26/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
17801 Romania 2/27/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
17802 Romania 2/28/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
17803 Romania 2/29/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
17804 Romania 3/1/2020 6.892 0.828 5.7 23313.199 19659270 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.570 0.866 0.7 25063.846 1850654 0.631631
20907 Latvia 12/26/2022 5.570 0.866 0.7 25063.846 1850654 0.631631
20908 Latvia 12/27/2022 5.570 0.866 0.7 25063.846 1850654 0.631485
20909 Latvia 12/28/2022 5.570 0.866 0.7 25063.846 1850654 0.631485
20910 Latvia 12/29/2022 5.570 0.866 0.7 25063.846 1850654 0.631969

2076 rows × 8 columns

In [137]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Frame the per-country time series as a supervised problem: lag the target by
# 1 day, 7 days, and 30 days within each location group.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [138]:
# The shifts above leave NaNs where a country has no history yet (first
# 1/7/30 rows per location); treat that missing history as a rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [139]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] keeps every column after 'location' and 'date',
# which at this point includes the 'Mortality Rate' target and its three lagged
# copies — so the components are fit on the target itself (target leakage) and
# on the not-yet-split test rows; this likely explains the near-perfect R^2 below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[139]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [140]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Keep the 5 highest-variance components of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [141]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of all
# inputs), not the original variables — re-using original feature names here
# makes the later "feature importances" easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [142]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below (X is built
# from principal_df), so this call only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [143]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
# X holds the first five principal components (labelled with original feature
# names); y is the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a daily time series, putting
# highly autocorrelated neighbouring days in both train and test; prefer a
# chronological split for an honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [144]:
# Fit scaling on the training set
# Standardiser statistics come from training rows only (no test leakage here).
scaler = StandardScaler()
scaler.fit(X_train)
Out[144]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [145]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [146]:
# Apply scaling on the test set (re-using the training-set mean/std)
X_test_scaled = scaler.transform(X_test)
In [147]:
# Base XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()

# Search space for the grid search: tree depth, shrinkage, ensemble size,
# minimum split loss, and row/column subsampling rates.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [148]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds are random slices of a shuffled time series, so
# fold scores inherit the same look-ahead leakage as the random train/test split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9969490880205945
In [149]:
# GridSearchCV (with the default refit=True) already refits the best parameter
# combination on the full training set, so best_estimator_ is a trained model —
# the explicit second fit() call here was redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [150]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between two
# *normalised distributions*, not a regression metric; any zero in y_test makes
# it return inf (as the recorded output below shows).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00261187990421846
R2 Score: 0.9982209918097331
RMSE: 0.051107
Entropy Value: inf
In [151]:
# NOTE(review): these importances describe the principal components, which were
# only *named* after original variables in principal_df — they are not the
# importances of the original variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[151]:
feature importance
0 hospital_beds_per_thousand 0.655538
1 human_development_index 0.257969
2 extreme_poverty 0.059863
3 gdp_per_capita 0.025707
4 population 0.000922
In [152]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — a pathlib-based DATA_DIR
# constant would make the notebook runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Echo the frame for a quick visual sanity check
df_updated
Out[152]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [153]:
# Pair of countries compared in this population-health-index run.
country1 = 'Serbia'
country2 = 'Spain'

# Restrict the frame to the population-health-index features (plus identifiers
# and the target) for the two selected countries, in a single .loc selection.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
In [154]:
# Inspect the filtered two-country frame before feature engineering
df_updated
Out[154]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2101 rows × 10 columns

In [155]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Frame the per-country time series as a supervised problem: lag the target by
# 1 day, 7 days, and 30 days within each location group.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [156]:
# The shifts above leave NaNs where a country has no history yet (first
# 1/7/30 rows per location); treat that missing history as a rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [157]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] keeps every column after 'location' and 'date',
# which at this point includes the 'Mortality Rate' target and its three lagged
# copies — so the components are fit on the target itself (target leakage) and
# on the not-yet-split test rows; this likely explains the near-perfect R^2 below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[157]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [158]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Keep the 7 highest-variance components of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [159]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of all
# inputs), not the original variables — re-using original feature names here
# makes the later "feature importances" easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [160]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below (X is built
# from principal_df), so this call only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [161]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first seven principal components (labelled with original feature
# names); y is the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a daily time series, putting
# highly autocorrelated neighbouring days in both train and test; prefer a
# chronological split for an honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [162]:
# Fit scaling on the training set
# Standardiser statistics come from training rows only (no test leakage here).
scaler = StandardScaler()
scaler.fit(X_train)
Out[162]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [163]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [164]:
# Apply scaling on the test set (re-using the training-set mean/std)
X_test_scaled = scaler.transform(X_test)
In [165]:
# Base XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()

# Search space for the grid search: tree depth, shrinkage, ensemble size,
# minimum split loss, and row/column subsampling rates.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [166]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds are random slices of a shuffled time series, so
# fold scores inherit the same look-ahead leakage as the random train/test split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.998793696055355
In [167]:
# GridSearchCV (with the default refit=True) already refits the best parameter
# combination on the full training set, so best_estimator_ is a trained model —
# the explicit second fit() call here was redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [168]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between two
# *normalised distributions*, not a regression metric; any zero in y_test makes
# it return inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0068977309284879535
R2 Score: 0.9988942023582971
RMSE: 0.083053
Entropy Value: 0.000581703964477074
In [169]:
# NOTE(review): these importances describe the principal components, which were
# only *named* after original variables in principal_df — they are not the
# importances of the original variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[169]:
feature importance
1 diabetes_prevalence 0.588572
5 aged_65_older 0.190679
0 cardiovasc_death_rate 0.163446
6 median_age 0.036807
2 female_smokers 0.016779
3 male_smokers 0.003627
4 life_expectancy 0.000090
In [170]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — a pathlib-based DATA_DIR
# constant would make the notebook runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Echo the frame for a quick visual sanity check
df_updated
Out[170]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [171]:
# Pair of countries compared in this country-health-index run.
country1 = 'Serbia'
country2 = 'Spain'

# Restrict the frame to the country-health-index features (plus identifiers and
# the target) for the two selected countries, in a single .loc selection.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_index_cols]
In [172]:
# Inspect the filtered two-country frame before feature engineering
df_updated
Out[172]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.806 0.05 14048.881 6871547 0.000000
... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.970 0.904 1.00 34272.360 47558632 0.855148
25133 Spain 12/26/2022 2.970 0.904 1.00 34272.360 47558632 0.855148
25134 Spain 12/27/2022 2.970 0.904 1.00 34272.360 47558632 0.855148
25135 Spain 12/28/2022 2.970 0.904 1.00 34272.360 47558632 0.855148
25136 Spain 12/29/2022 2.970 0.904 1.00 34272.360 47558632 0.855148

2101 rows × 8 columns

In [173]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Frame the per-country time series as a supervised problem: lag the target by
# 1 day, 7 days, and 30 days within each location group.
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [174]:
# The shifts above leave NaNs where a country has no history yet (first
# 1/7/30 rows per location); treat that missing history as a rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [175]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] keeps every column after 'location' and 'date',
# which at this point includes the 'Mortality Rate' target and its three lagged
# copies — so the components are fit on the target itself (target leakage) and
# on the not-yet-split test rows; this likely explains the near-perfect R^2 below.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[175]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [176]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Keep the 5 highest-variance components of the projection.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [177]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixtures of all
# inputs), not the original variables — re-using original feature names here
# makes the later "feature importances" easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [178]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used below (X is built
# from principal_df), so this call only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [179]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
# X holds the first five principal components (labelled with original feature
# names); y is the raw mortality-rate target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split shuffles a daily time series, putting
# highly autocorrelated neighbouring days in both train and test; prefer a
# chronological split for an honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [180]:
# Fit scaling on the training set
# Standardiser statistics come from training rows only (no test leakage here).
scaler = StandardScaler()
scaler.fit(X_train)
Out[180]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [181]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [182]:
# Apply scaling on the test set (re-using the training-set mean/std)
X_test_scaled = scaler.transform(X_test)
In [183]:
# Base XGBoost regressor whose hyperparameters are tuned below
xgb_model = xgb.XGBRegressor()

# Search space for the grid search: tree depth, shrinkage, ensemble size,
# minimum split loss, and row/column subsampling rates.
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [184]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds are random slices of a shuffled time series, so
# fold scores inherit the same look-ahead leakage as the random train/test split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985077942246274
In [185]:
# GridSearchCV (with the default refit=True) already refits the best parameter
# combination on the full training set, so best_estimator_ is a trained model —
# the explicit second fit() call here was redundant work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [186]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) is the KL divergence between two
# *normalised distributions*, not a regression metric; any zero in y_test makes
# it return inf.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011012684782410768
R2 Score: 0.9982345207449435
RMSE: 0.104941
Entropy Value: 0.0011198641869641568
In [187]:
# NOTE(review): these importances describe the principal components, which were
# only *named* after original variables in principal_df — they are not the
# importances of the original variables themselves.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[187]:
feature importance
1 human_development_index 0.575847
0 hospital_beds_per_thousand 0.191762
2 extreme_poverty 0.164986
3 gdp_per_capita 0.035551
4 population 0.031854
In [188]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — a pathlib-based DATA_DIR
# constant would make the notebook runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Echo the frame for a quick visual sanity check
df_updated
Out[188]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [189]:
# Pair of countries compared in this population-health-index run.
country1 = 'Sweden'
country2 = 'United States'

# Restrict the frame to the population-health-index features (plus identifiers
# and the target) for the two selected countries, in a single .loc selection.
pop_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), pop_health_cols]
In [190]:
# Inspect the filtered two-country frame before feature engineering
df_updated
Out[190]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
23011 Sweden 2/1/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23012 Sweden 2/2/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23013 Sweden 2/3/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23014 Sweden 2/4/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23015 Sweden 2/5/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [191]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') stops a lag from crossing the country boundary; rows are assumed
# date-sorted within each country — TODO confirm against the CSV ordering.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [192]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling the pre-history lags fabricates a "zero mortality" signal for the
# first 30 days of each country; dropping those rows may be sounder — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [193]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# WARNING(review): iloc[:, 2:] spans every column after 'location'/'date', which here includes
# 'Mortality Rate' (the prediction target) and its three lag columns. Fitting PCA on the target
# leaks it into the model inputs and inflates the downstream CV/test scores; fit PCA on the
# predictor columns only.
# NOTE(review): features are conventionally standardized *before* PCA (it is variance-sensitive);
# here scaling happens after PCA — confirm this ordering is intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[193]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [194]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # number of raw population-health predictors retained as components
# NOTE(review): pca was fit on 11 columns (7 predictors + target + 3 lags), so this keeps the
# first 7 of 11 components — not a one-to-one mapping onto the raw predictors.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [195]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): each PCA component is an orthogonal linear mix of *all* inputs; labelling the
# columns with raw feature names is misleading — downstream "feature importances" are really
# component importances. Prefer names like PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [196]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model (X is built from principal_df),
# so this step only removes 'location' from df_updated — confirm it is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [197]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 shuffle of daily time-series rows puts near-duplicate adjacent
# days in both train and test, which inflates test metrics; a chronological split would give an
# honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [198]:
# Fit scaling on the training set only (the test set is transformed with these statistics later).
scaler = StandardScaler()
scaler.fit(X_train)
Out[198]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [199]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [200]:
# Apply scaling on the test set (using the train-fitted scaler — no test leakage in this step).
X_test_scaled = scaler.transform(X_test)
In [201]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 combinations; with cv=10 below that is 3,240 model fits —
# RandomizedSearchCV would explore the same space far more cheaply.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [202]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# best_score_ is the mean cross-validated R^2 (sklearn's default scorer for regressors).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977617836323432
In [203]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): GridSearchCV(refit=True, the default) already refit best_estimator_ on the full
# training set, so this fit() call retrains on the same data — redundant but harmless.
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [204]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arguments into probability distributions and
# returns their KL divergence; mortality rates are not distributions, so this value is not a
# standard regression metric (and zeros in y_pred can make it infinite) — confirm it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06163465865693719
R2 Score: 0.9881701775260999
RMSE: 0.248263
Entropy Value: 0.002275727343611559
In [205]:
# Rank the model inputs by XGBoost's importance measure.
# NOTE(review): the inputs are PCA components labelled with raw-feature names, so these are
# component importances, not importances of the original variables.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[205]:
feature importance
1 diabetes_prevalence 0.757595
0 cardiovasc_death_rate 0.141108
5 aged_65_older 0.063089
2 female_smokers 0.020350
3 male_smokers 0.009161
6 median_age 0.007329
4 life_expectancy 0.001368
In [206]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — prefer a configurable DATA_DIR. This same CSV
# is re-read before every country pair; the pipeline below is a copy of the cells above with
# different parameters and would be better expressed as one function called per
# (country1, country2, feature set).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[206]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [207]:
# Countries compared in this run of the country-health-index pipeline.
country1 = 'Sweden'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the rows for the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [208]:
df_updated
Out[208]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
23011 Sweden 2/1/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
23012 Sweden 2/2/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
23013 Sweden 2/3/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
23014 Sweden 2/4/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
23015 Sweden 2/5/2020 2.22 0.945 0.5 46949.283 10549349 0.000000
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.2 54225.446 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.2 54225.446 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.2 54225.446 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.2 54225.446 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.2 54225.446 338289856 1.084791

2136 rows × 8 columns

In [209]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each lag within one country; rows assumed date-sorted — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [210]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling pre-history lags fabricates a zero-mortality signal — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [211]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# WARNING(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its three lag columns,
# so the target leaks into the PCA inputs and inflates downstream scores; fit on predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[211]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [212]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # number of raw country-health predictors retained as components
# NOTE(review): pca was fit on 9 columns (5 predictors + target + 3 lags), so this keeps the
# first 5 of 9 components — not a one-to-one mapping onto the raw predictors.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [213]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): PCA components mix all inputs; labelling them with raw feature names is
# misleading — downstream "feature importances" are component importances. Prefer PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [214]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used by the model (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [215]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffling of daily time-series rows inflates test metrics;
# a chronological split would be sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [216]:
# Fit scaling on the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[216]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [217]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [218]:
# Apply scaling on the test set (train-fitted scaler — no test leakage in this step).
X_test_scaled = scaler.transform(X_test)
In [219]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 324 combinations x cv=10 = 3,240 fits; consider RandomizedSearchCV.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [220]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# best_score_ is the mean cross-validated R^2 (default regressor scorer).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9964446927359324
In [221]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): best_estimator_ is already refit by GridSearchCV (refit=True default);
# this fit() retrains on the same data — redundant but harmless.
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [222]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy returns KL divergence of the *normalized* inputs; not a
# standard regression metric for mortality rates — confirm it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.05830853179428446
R2 Score: 0.9888085762966661
RMSE: 0.241472
Entropy Value: 0.002494604791450158
In [223]:
# Rank the model inputs by XGBoost's importance measure.
# NOTE(review): inputs are PCA components labelled with raw-feature names, so these are
# component importances, not importances of the original variables.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[223]:
feature importance
1 human_development_index 0.556230
2 extreme_poverty 0.233704
0 hospital_beds_per_thousand 0.101283
3 gdp_per_capita 0.097960
4 population 0.010821
In [224]:
# Importing the dataframe of all 26 countries
# NOTE(review): repeated reload of the same hard-coded local CSV; load once into a raw frame and
# derive per-pair copies instead.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[224]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [225]:
# Countries compared in this run of the population-health pipeline.
country1 = 'Austria'
country2 = 'Cyprus'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the rows for the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [226]:
df_updated
Out[226]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
4148 Cyprus 12/25/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679
4149 Cyprus 12/26/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679
4150 Cyprus 12/27/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679
4151 Cyprus 12/28/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679
4152 Cyprus 12/29/2022 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.199679

2066 rows × 10 columns

In [227]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each lag within one country; rows assumed date-sorted — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [228]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling pre-history lags fabricates a zero-mortality signal — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [229]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# WARNING(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its three lag columns,
# so the target leaks into the PCA inputs and inflates downstream scores; fit on predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[229]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [230]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # number of raw population-health predictors retained as components
# NOTE(review): pca was fit on 11 columns (7 predictors + target + 3 lags), so this keeps the
# first 7 of 11 components — not a one-to-one mapping onto the raw predictors.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [231]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): PCA components mix all inputs; labelling them with raw feature names is
# misleading — downstream "feature importances" are component importances. Prefer PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [232]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used by the model (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [233]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffling of daily time-series rows inflates test metrics;
# a chronological split would be sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [234]:
# Fit scaling on the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[234]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [235]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [236]:
# Apply scaling on the test set (train-fitted scaler — no test leakage in this step).
X_test_scaled = scaler.transform(X_test)
In [237]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 324 combinations x cv=10 = 3,240 fits; consider RandomizedSearchCV.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [238]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# best_score_ is the mean cross-validated R^2 (default regressor scorer).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977294625245022
In [239]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): best_estimator_ is already refit by GridSearchCV (refit=True default);
# this fit() retrains on the same data — redundant but harmless.
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [240]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy returns KL divergence of the *normalized* inputs; not a
# standard regression metric for mortality rates — confirm it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0011518898943072675
R2 Score: 0.9989790639681787
RMSE: 0.033940
Entropy Value: 0.0004452189828876654
In [241]:
# Rank the model inputs by XGBoost's importance measure.
# NOTE(review): inputs are PCA components labelled with raw-feature names, so these are
# component importances, not importances of the original variables.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[241]:
feature importance
1 diabetes_prevalence 0.681072
0 cardiovasc_death_rate 0.265483
2 female_smokers 0.021585
6 median_age 0.015140
5 aged_65_older 0.014290
3 male_smokers 0.001557
4 life_expectancy 0.000873
In [242]:
# Importing the dataframe of all 26 countries
# NOTE(review): repeated reload of the same hard-coded local CSV; load once into a raw frame and
# derive per-pair copies instead.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[242]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [243]:
# Countries compared in this run of the country-health-index pipeline.
country1 = 'Austria'
country2 = 'Cyprus'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
# Keep only the rows for the two countries under comparison.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [244]:
df_updated
Out[244]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.70 45436.686 8939617 0.000000
... ... ... ... ... ... ... ... ...
4148 Cyprus 12/25/2022 3.40 0.887 0.15 32415.132 896007 0.199679
4149 Cyprus 12/26/2022 3.40 0.887 0.15 32415.132 896007 0.199679
4150 Cyprus 12/27/2022 3.40 0.887 0.15 32415.132 896007 0.199679
4151 Cyprus 12/28/2022 3.40 0.887 0.15 32415.132 896007 0.199679
4152 Cyprus 12/29/2022 3.40 0.887 0.15 32415.132 896007 0.199679

2066 rows × 8 columns

In [245]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each lag within one country; rows assumed date-sorted — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [246]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling pre-history lags fabricates a zero-mortality signal — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [247]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# WARNING(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its three lag columns,
# so the target leaks into the PCA inputs and inflates downstream scores; fit on predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[247]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [248]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # number of raw country-health predictors retained as components
# NOTE(review): pca was fit on 9 columns (5 predictors + target + 3 lags), so this keeps the
# first 5 of 9 components — not a one-to-one mapping onto the raw predictors.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [249]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): PCA components mix all inputs; labelling them with raw feature names is
# misleading — downstream "feature importances" are component importances. Prefer PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [250]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used by the model (X comes from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [251]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random shuffling of daily time-series rows inflates test metrics;
# a chronological split would be sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [252]:
# Fit scaling on the training set only.
scaler = StandardScaler()
scaler.fit(X_train)
Out[252]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [253]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [254]:
# Apply scaling on the test set (train-fitted scaler — no test leakage in this step).
X_test_scaled = scaler.transform(X_test)
In [255]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 324 combinations x cv=10 = 3,240 fits; consider RandomizedSearchCV.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [256]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# best_score_ is the mean cross-validated R^2 (default regressor scorer).
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9936138289842674
In [257]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
# NOTE(review): best_estimator_ is already refit by GridSearchCV (refit=True default);
# this fit() retrains on the same data — redundant but harmless.
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [258]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy returns KL divergence of the *normalized* inputs; not a
# standard regression metric for mortality rates — confirm it is wanted.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003065469000387181
R2 Score: 0.9972830321957044
RMSE: 0.055367
Entropy Value: 0.0009279980175255162
In [259]:
# Rank the model inputs by XGBoost's importance measure.
# NOTE(review): inputs are PCA components labelled with raw-feature names, so these are
# component importances, not importances of the original variables.
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[259]:
feature importance
1 human_development_index 0.602723
0 hospital_beds_per_thousand 0.253940
4 population 0.063896
2 extreme_poverty 0.044890
3 gdp_per_capita 0.034551
In [260]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path breaks reproducibility on any
# other machine; prefer a configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[260]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [261]:
country1 = 'Czechia'
country2 = 'Denmark'

# Restrict the frame to the population-health-index features (plus the
# identifier columns and the mortality target) for the two countries
# under comparison, in a single .loc selection.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [262]:
# Preview the filtered two-country frame (relies on notebook rich display).
df_updated
Out[262]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.227772
6245 Denmark 12/26/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.227772
6246 Denmark 12/27/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.228905
6247 Denmark 12/28/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.229131
6248 Denmark 12/29/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.229131

2096 rows × 10 columns

In [263]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day), shifting within
# each country so one country's history never bleeds into another's.
for lag_name, lag in [('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [264]:
# The lagged columns are NaN at the start of each country's series (nothing
# earlier to shift from); treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [265]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the components are partly built
# from the target itself — a data-leakage risk that inflates downstream model
# scores. PCA is also fitted on unscaled columns, so large-magnitude variables
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[265]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [266]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): this keeps the first 7 of 11 components (7 health variables +
# target + 3 lags went into the PCA), so the retained components are not pure
# transformations of the 7 health variables.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [267]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but
# each column is a principal component (a mixture of all PCA inputs), not the
# named variable — downstream feature importances inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [268]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused below (X is taken from
# principal_df and y from 'Mortality Rate') — confirm whether this step is
# still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [269]:
# Principal components stand in for the seven population-health variables.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a daily time series places past and future
# rows of the same country in both sets; combined with target-derived PCA
# inputs this likely overstates test performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [270]:
# Fit the standardizer on the training split only; the trailing expression
# displays the fitted estimator, matching the original cell output.
scaler = StandardScaler().fit(X_train)
scaler
Out[270]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [271]:
# Apply scaling on the training set
# transform (not fit_transform): the scaler was already fitted, so this only
# applies the stored mean/std to the training features.
X_train_scaled = scaler.transform(X_train)
In [272]:
# Apply scaling on the test set
# transform only — the test set is scaled with training-set statistics,
# which keeps test data out of the fitting step.
X_test_scaled = scaler.transform(X_test)
In [273]:
# Instantiate the XGBoost regressor with default settings; the grid search
# below selects the final hyperparameters.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over (3*3*3*3*2*2 = 324 candidates).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [274]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds come from a randomly shuffled split of daily
# time-series rows, so the CV score is likely optimistic; consider
# TimeSeriesSplit for an honest estimate — TODO confirm.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990768399961493
In [275]:
# Use the best estimator found by the grid search.
# GridSearchCV (refit=True by default) has already refit best_estimator_ on
# the full training set, so an explicit second .fit() call was redundant and
# only repeated the same training work; it has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [276]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of two
# probability distributions (it normalizes its inputs). Mortality rates are
# not a distribution, and zeros in y_test/y_pred can drive the value to 0 or
# inf — this is not a standard regression metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002233914255848466
R2 Score: 0.9981453469995119
RMSE: 0.047264
Entropy Value: 0.0004229517361525248
In [277]:
# Rank model inputs by XGBoost importance.
# NOTE(review): the model was fit on principal components (X came from
# principal_df), so each labeled "feature" is actually a PCA component, not
# the original variable — do not read these as variable-level effects.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[277]:
feature importance
1 diabetes_prevalence 0.817142
5 aged_65_older 0.058152
0 cardiovasc_death_rate 0.051193
6 median_age 0.048701
2 female_smokers 0.023894
3 male_smokers 0.000842
4 life_expectancy 0.000075
In [278]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path breaks reproducibility on any
# other machine; prefer a configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[278]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [279]:
country1 = 'Czechia'
country2 = 'Denmark'

# Restrict the frame to the country-health-index features (plus the
# identifier columns and the mortality target) for the two countries
# under comparison, in a single .loc selection.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [280]:
# Preview the filtered two-country frame (relies on notebook rich display).
df_updated
Out[280]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
4153 Czechia 3/1/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
4154 Czechia 3/2/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
4155 Czechia 3/3/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
4156 Czechia 3/4/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
4157 Czechia 3/5/2020 6.63 0.90 0.0 32605.906 10493990 0.000000
... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 2.50 0.94 0.2 46682.515 5882259 0.227772
6245 Denmark 12/26/2022 2.50 0.94 0.2 46682.515 5882259 0.227772
6246 Denmark 12/27/2022 2.50 0.94 0.2 46682.515 5882259 0.228905
6247 Denmark 12/28/2022 2.50 0.94 0.2 46682.515 5882259 0.229131
6248 Denmark 12/29/2022 2.50 0.94 0.2 46682.515 5882259 0.229131

2096 rows × 8 columns

In [281]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day), shifting within
# each country so one country's history never bleeds into another's.
for lag_name, lag in [('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [282]:
# The lagged columns are NaN at the start of each country's series (nothing
# earlier to shift from); treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [283]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the components are partly built
# from the target itself — a data-leakage risk that inflates downstream model
# scores. PCA is also fitted on unscaled columns, so large-magnitude variables
# (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[283]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [284]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): this keeps the first 5 of 9 components (5 country-level
# variables + target + 3 lags went into the PCA), so the retained components
# are not pure transformations of the 5 country-level variables.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [285]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but
# each column is a principal component (a mixture of all PCA inputs), not the
# named variable — downstream feature importances inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [286]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused below (X is taken from
# principal_df and y from 'Mortality Rate') — confirm whether this step is
# still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [287]:
# Principal components stand in for the five country-health variables.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a daily time series places past and future
# rows of the same country in both sets; combined with target-derived PCA
# inputs this likely overstates test performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [288]:
# Fit the standardizer on the training split only; the trailing expression
# displays the fitted estimator, matching the original cell output.
scaler = StandardScaler().fit(X_train)
scaler
Out[288]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [289]:
# Apply scaling on the training set
# transform (not fit_transform): the scaler was already fitted, so this only
# applies the stored mean/std to the training features.
X_train_scaled = scaler.transform(X_train)
In [290]:
# Apply scaling on the test set
# transform only — the test set is scaled with training-set statistics,
# which keeps test data out of the fitting step.
X_test_scaled = scaler.transform(X_test)
In [291]:
# Instantiate the XGBoost regressor with default settings; the grid search
# below selects the final hyperparameters.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over (3*3*3*3*2*2 = 324 candidates).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [292]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds come from a randomly shuffled split of daily
# time-series rows, so the CV score is likely optimistic; consider
# TimeSeriesSplit for an honest estimate — TODO confirm.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984852360468537
In [293]:
# Use the best estimator found by the grid search.
# GridSearchCV (refit=True by default) has already refit best_estimator_ on
# the full training set, so an explicit second .fit() call was redundant and
# only repeated the same training work; it has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [294]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of two
# probability distributions (it normalizes its inputs). Mortality rates are
# not a distribution, and zeros in y_test/y_pred can drive the value to 0 or
# inf — this is not a standard regression metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002011261480005173
R2 Score: 0.9983301990535706
RMSE: 0.044847
Entropy Value: 0.0005490250491042806
In [295]:
# Rank model inputs by XGBoost importance.
# NOTE(review): the model was fit on principal components (X came from
# principal_df), so each labeled "feature" is actually a PCA component, not
# the original variable — do not read these as variable-level effects.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[295]:
feature importance
1 human_development_index 0.848978
0 hospital_beds_per_thousand 0.066612
3 gdp_per_capita 0.039738
2 extreme_poverty 0.037550
4 population 0.007122
In [296]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path breaks reproducibility on any
# other machine; prefer a configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[296]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [297]:
country1 = 'France'
country2 = 'Portugal'

# Restrict the frame to the population-health-index features (plus the
# identifier columns and the mortality target) for the two countries
# under comparison, in a single .loc selection.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [298]:
# Preview the filtered two-country frame (relies on notebook rich display).
df_updated
Out[298]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
8376 France 1/24/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8377 France 1/25/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8378 France 1/26/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8379 France 1/27/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
8380 France 1/28/2020 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11514 Portugal 12/26/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11515 Portugal 12/27/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11516 Portugal 12/28/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11517 Portugal 12/29/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977

2105 rows × 10 columns

In [299]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day), shifting within
# each country so one country's history never bleeds into another's.
for lag_name, lag in [('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [300]:
# The lagged columns are NaN at the start of each country's series (nothing
# earlier to shift from); treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [301]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the components are partly built
# from the target itself — a data-leakage risk that inflates downstream model
# scores. PCA is also fitted on unscaled columns, so large-magnitude variables
# dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[301]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [302]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): this keeps the first 7 of 11 components (7 health variables +
# target + 3 lags went into the PCA), so the retained components are not pure
# transformations of the 7 health variables.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [303]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original variable names, but
# each column is a principal component (a mixture of all PCA inputs), not the
# named variable — downstream feature importances inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [304]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused below (X is taken from
# principal_df and y from 'Mortality Rate') — confirm whether this step is
# still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [305]:
# Principal components stand in for the seven population-health variables.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a daily time series places past and future
# rows of the same country in both sets; combined with target-derived PCA
# inputs this likely overstates test performance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [306]:
# Fit the standardizer on the training split only; the trailing expression
# displays the fitted estimator, matching the original cell output.
scaler = StandardScaler().fit(X_train)
scaler
Out[306]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [307]:
# Apply scaling on the training set
# transform (not fit_transform): the scaler was already fitted, so this only
# applies the stored mean/std to the training features.
X_train_scaled = scaler.transform(X_train)
In [308]:
# Apply scaling on the test set
# transform only — the test set is scaled with training-set statistics,
# which keeps test data out of the fitting step.
X_test_scaled = scaler.transform(X_test)
In [309]:
# Instantiate the XGBoost regressor with default settings; the grid search
# below selects the final hyperparameters.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search over (3*3*3*3*2*2 = 324 candidates).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [310]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): the CV folds come from a randomly shuffled split of daily
# time-series rows, so the CV score is likely optimistic; consider
# TimeSeriesSplit for an honest estimate — TODO confirm.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9957831690613299
In [311]:
# Use the best estimator found by the grid search.
# GridSearchCV (refit=True by default) has already refit best_estimator_ on
# the full training set, so an explicit second .fit() call was redundant and
# only repeated the same training work; it has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [312]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes the KL divergence of two
# probability distributions (it normalizes its inputs). Mortality rates are
# not a distribution, and zeros in y_test/y_pred can drive the value to 0 or
# inf — this is not a standard regression metric; interpret with caution.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.047477141309076605
R2 Score: 0.9958871286533991
RMSE: 0.217892
Entropy Value: 0.0012934258184287219
In [313]:
# Rank model inputs by XGBoost importance.
# NOTE(review): the model was fit on principal components (X came from
# principal_df), so each labeled "feature" is actually a PCA component, not
# the original variable — do not read these as variable-level effects.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[313]:
feature importance
1 diabetes_prevalence 0.565073
0 cardiovasc_death_rate 0.373921
6 median_age 0.035919
2 female_smokers 0.010778
3 male_smokers 0.005120
4 life_expectancy 0.004886
5 aged_65_older 0.004303
In [314]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path breaks reproducibility on any
# other machine; prefer a configurable DATA_DIR / relative path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[314]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [315]:
country1 = 'France'
country2 = 'Portugal'

# Restrict the frame to the country-health-index features (plus the
# identifier columns and the mortality target) for the two countries
# under comparison, in a single .loc selection.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [316]:
# Preview the filtered two-country frame (relies on notebook rich display).
df_updated
Out[316]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
8376 France 1/24/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
8377 France 1/25/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
8378 France 1/26/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
8379 France 1/27/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
8380 France 1/28/2020 5.98 0.901 0.02 38605.671 67813000 0.000000
... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 3.39 0.864 0.50 27936.896 10270857 0.462977
11514 Portugal 12/26/2022 3.39 0.864 0.50 27936.896 10270857 0.462977
11515 Portugal 12/27/2022 3.39 0.864 0.50 27936.896 10270857 0.462977
11516 Portugal 12/28/2022 3.39 0.864 0.50 27936.896 10270857 0.462977
11517 Portugal 12/29/2022 3.39 0.864 0.50 27936.896 10270857 0.462977

2105 rows × 8 columns

In [317]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the lagged mortality features (1-day, 7-day, 30-day), shifting within
# each country so one country's history never bleeds into another's.
for lag_name, lag in [('prev_day_mortality', 1),
                      ('prev_week_mortality', 7),
                      ('prev_month_mortality', 30)]:
    df_updated[lag_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [318]:
# The lagged columns are NaN at the start of each country's series (nothing
# earlier to shift from); treat those missing lags as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [319]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the components are partly built
# from the target itself — a data-leakage risk that inflates downstream model
# scores. PCA is also fitted on unscaled columns, so large-magnitude variables
# (e.g. population) dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[319]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [320]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): this keeps the first 5 of 9 components (5 country-level
# variables + target + 3 lags went into the PCA), so the retained components
# are not pure transformations of the 5 country-level variables.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [321]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [322]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [323]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [324]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[324]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [325]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [326]:
# Apply scaling on the test set (training-set statistics, no test leakage here)
X_test_scaled = scaler.transform(X_test)
In [327]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune: 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [328]:
# Perform grid search and 10-fold cross-validation (k = 10).
# No `scoring` is given, so the regressor's default (R^2) is used, and the
# default refit=True means best_estimator_ is re-fit on the full training set
# when the search finishes.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9931385478865573
In [329]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [330]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence. Mortality rates
# are not distributions, so this value has no standard interpretation here,
# and it becomes inf if any prediction is 0 where y_test is positive —
# verify this metric is actually intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0845928198808711
R2 Score: 0.9926718548037413
RMSE: 0.290848
Entropy Value: 0.004828902175062184
In [331]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[331]:
feature importance
1 human_development_index 0.615555
4 population 0.159129
0 hospital_beds_per_thousand 0.114584
2 extreme_poverty 0.090604
3 gdp_per_capita 0.020128
In [332]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not portable; prefer a
# configurable data directory. This reload also discards the one-hot-encoded
# frame from the previous analysis, restarting the pipeline from raw data.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[332]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [333]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [334]:
df_updated
Out[334]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2091 rows × 10 columns

In [335]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Per-country lags; assumes rows are date-sorted within each location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [336]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (fills "no history yet" rows with literal zero mortality)
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [337]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): after the column selection at In [333], iloc[:, 2:] covers the
# seven health features PLUS 'Mortality Rate' and the three lagged-mortality
# columns — the target leaks into the components, likely inflating R^2 below.
# PCA is also fit on unscaled data; StandardScaler is only applied afterwards.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[337]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [338]:
# Keep the first 7 principal components (matches the number of
# population-health input variables for the XGBoost analysis).
n_components = 7  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [339]:
# NOTE(review): these columns are principal components PC1..PC7, not the
# original features — labelling them with original names is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [340]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [341]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# (random split of a time series — TODO confirm a chronological split is not needed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [342]:
# Fit scaling on the training set only; the test set reuses these statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[342]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [343]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [344]:
# Apply scaling on the test set using training-set statistics (no test leakage)
X_test_scaled = scaler.transform(X_test)
In [345]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [346]:
# Perform grid search and 10-fold cross-validation (k = 10).
# Default scoring (R^2 for regressors); default refit=True means
# best_estimator_ is already re-fit on the full training set afterwards.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984789767307959
In [347]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV's refit=True already re-fit
# best_estimator_ on the full training set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [348]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is KL divergence between
# normalized distributions — mortality rates are not distributions; verify
# this metric is intended (and it is inf if a prediction is 0 where y_test > 0).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0027083668308768272
R2 Score: 0.9986712195046868
RMSE: 0.052042
Entropy Value: 0.00047135693736216383
In [349]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[349]:
feature importance
6 median_age 0.593303
1 diabetes_prevalence 0.295471
0 cardiovasc_death_rate 0.053646
5 aged_65_older 0.050607
4 life_expectancy 0.003674
2 female_smokers 0.002363
3 male_smokers 0.000937
In [350]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not portable; this reload
# restarts the pipeline from raw data for the next country pair.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[350]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [351]:
country1 = 'Slovakia'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): this cell and the whole pipeline that follows repeat earlier
# cells verbatim — a parameterized function over (country1, country2, columns)
# would remove the duplication.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [352]:
df_updated
Out[352]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 0.7 30155.152 5643455 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 31400.840 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 31400.840 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 31400.840 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 31400.840 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 31400.840 2119843 0.536669

2091 rows × 8 columns

In [353]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (same lag construction as the earlier runs; assumes date-sorted rows per location)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [354]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [355]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged-mortality
# columns (see the selection at In [351]) — target leakage into the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[355]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [356]:
# Keep the first 5 principal components (matches the country-health-index input count).
n_components = 5  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [357]:
# NOTE(review): columns are principal components PC1..PC5, not the original
# features — the original names here are misleading labels.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [358]:
# One-hot encode 'location'; NOTE(review): the dummies are never used as inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [359]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [360]:
# Fit scaling on the training set only
scaler = StandardScaler()
scaler.fit(X_train)
Out[360]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [361]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [362]:
# Apply scaling on the test set (training-set statistics)
X_test_scaled = scaler.transform(X_test)
In [363]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [364]:
# Perform grid search and 10-fold cross-validation (k = 10); default scoring
# is R^2 for regressors, and refit=True re-fits best_estimator_ automatically.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9977419513278931
In [365]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — refit=True already re-fit best_estimator_.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [366]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): KL divergence of normalized values — mortality rates are not
# probability distributions; verify this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00345193732286861
R2 Score: 0.9983064085214091
RMSE: 0.058753
Entropy Value: 0.0008641336305743049
In [367]:
# Importance ranking of the PCA-derived inputs (these are components
# mislabelled with original feature names). `feature_importances` is reused
# for the ndarray and then the DataFrame — shadowing worth cleaning up.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[367]:
feature importance
1 human_development_index 0.766836
0 hospital_beds_per_thousand 0.123667
2 extreme_poverty 0.091805
3 gdp_per_capita 0.015496
4 population 0.002195
In [368]:
# Importing the dataframe of all 26 countries
# (hardcoded absolute Windows path — not portable; restarts the pipeline
# from raw data for the Belgium/Italy comparison)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[368]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [369]:
country1 = 'Belgium'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [370]:
df_updated
Out[370]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2124 rows × 10 columns

In [371]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (assumes rows are date-sorted within each location)
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [372]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [373]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lag columns
# (see the selection at In [369]) — target leakage into the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[373]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [374]:
# Keep the first 7 principal components (matches the population-health input count).
n_components = 7  # number of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [375]:
# NOTE(review): columns are principal components PC1..PC7, not original features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [376]:
# One-hot encode 'location'; NOTE(review): the dummies are never used as inputs.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [377]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [378]:
# Fit scaling on the training set only
scaler = StandardScaler()
scaler.fit(X_train)
Out[378]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [379]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [380]:
# Apply scaling on the test set (training-set statistics)
X_test_scaled = scaler.transform(X_test)
In [381]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [382]:
# Perform grid search and 10-fold cross-validation (k = 10); default scoring
# is R^2 for regressors, and refit=True re-fits best_estimator_ automatically.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992146796046744
In [383]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — refit=True already re-fit best_estimator_.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [384]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): KL divergence of normalized values — mortality rates are not
# probability distributions; verify this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.017406911489324793
R2 Score: 0.9991022590867014
RMSE: 0.131935
Entropy Value: 0.00043772807452064234
In [385]:
# Importance ranking of the PCA-derived inputs (components mislabelled with
# original feature names); `feature_importances` is reused for both the
# ndarray and the DataFrame — shadowing worth cleaning up.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[385]:
feature importance
0 cardiovasc_death_rate 0.549617
5 aged_65_older 0.265471
1 diabetes_prevalence 0.155939
2 female_smokers 0.025952
6 median_age 0.001582
3 male_smokers 0.001153
4 life_expectancy 0.000287
In [386]:
# Importing the dataframe of all 26 countries
# (hardcoded absolute Windows path — not portable; restarts the pipeline
# from raw data for the country-health-index run on Belgium/Italy)
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[386]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [387]:
country1 = 'Belgium'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): verbatim repetition of earlier cells — a parameterized helper
# over (countries, columns) would remove the duplication.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [388]:
df_updated
Out[388]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
1039 Belgium 2/4/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
1040 Belgium 2/5/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
1041 Belgium 2/6/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
1042 Belgium 2/7/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
1043 Belgium 2/8/2020 5.64 0.931 0.2 42658.576 11655923 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 35220.084 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 35220.084 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 35220.084 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 35220.084 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 35220.084 59037472 0.735109

2124 rows × 8 columns

In [389]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') confines each lag to its own country, so the first rows of
# one country never pick up the previous country's values.
# NOTE(review): shift(1/7/30) only means "previous day/week/month" if rows are
# daily, gap-free, and sorted by date within each country -- TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [390]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): this fabricates a zero mortality history for each country's
# first day/week/month; dropping those warm-up rows instead would avoid the
# artificial zeros.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [391]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] spans every column after 'location'/'date', which
# includes 'Mortality Rate' (the target) and its three lags -- the components
# therefore encode the target itself (leakage), which explains the near-perfect
# downstream scores. Also, PCA is fitted on unscaled data, so huge-magnitude
# columns such as 'population' (~1e7) dominate the components over columns like
# 'human_development_index' (~0.9). Fit PCA on scaled feature columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[391]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [392]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA input actually has 9 columns here (5 features + target
# + 3 lags), so these are the first 5 of 9 components, not a component per
# named input variable.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [393]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PC1..PC5 mislabeled with original feature
# names; each PC is a mixture of all PCA inputs. The labels propagate to the
# feature-importance table later and make it misleading -- rename to
# 'PC1'..'PC5' when refactoring.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
# Positional copy of the location labels; relies on principal_df and
# df_updated having the same row order (true here: both derive from the same
# frame, untouched in between).
principal_df['location'] = df_updated['location'].values
In [394]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns never reach the model -- X below is built
# from principal_df -- so this encoding is effectively dead code here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [395]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
# X holds the first 5 principal components (see the label caveat above is
# intentional: these names refer to principal_df's columns).
X = principal_df[selected_cols].values
# y is taken from df_updated; .values strips the index, so X/y alignment is
# purely positional -- holds because both frames share the same row order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffle split on a time series lets the model see
# "future" rows at train time; a chronological split would be more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [396]:
# Fit scaling on the training set
# Correctly fitted on the training split only, so test statistics don't leak
# into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[396]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [397]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [398]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [399]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations; with cv=10 below that is 3240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [400]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises over all cores; scoring defaults to the regressor's
# R^2. NOTE(review): the near-perfect best score is expected given that the
# PCA inputs included the target and its lags (see the PCA cell).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985734833226066
In [401]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [402]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.09706029058839453
R2 Score: 0.9949942300808909
RMSE: 0.311545
Entropy Value: 0.002419033420494509
In [403]:
# Rank the model's inputs by XGBoost's importance scores.
# NOTE(review): the model's inputs are principal components that merely carry
# the names of the original features (see the PCA cells), so a high score for
# e.g. 'human_development_index' is really the importance of PC2, a mixture of
# all PCA input columns -- do not interpret these as raw-feature importances.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Bare trailing expression -> rendered as the cell's Out[] table.
feature_importances
Out[403]:
feature importance
1 human_development_index 0.718673
3 gdp_per_capita 0.148969
2 extreme_poverty 0.112091
0 hospital_beds_per_thousand 0.011429
4 population 0.008838
In [404]:
# Importing the dataframe of all 26 countries
# Reloading resets df_updated, which the previous pipeline mutated in place.
# NOTE(review): hardcoded absolute Windows path -- breaks on any other machine;
# prefer a DATA_DIR constant / relative path. The whole per-country-pair
# pipeline is copy-pasted four times in this notebook; a parameterised
# function run_pipeline(df, country1, country2, cols) would remove the
# duplication and the need to re-read the CSV each time.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[404]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [405]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict to the population-health-index features (plus the location/date
# identifiers and the 'Mortality Rate' target), then keep only the two
# countries under comparison.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[:, population_health_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [406]:
df_updated
Out[406]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872

2078 rows × 10 columns

In [407]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Per-country lags via groupby('location'); shift(1/7/30) assumes daily,
# gap-free, date-sorted rows within each country -- TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [408]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fabricates a zero mortality history for each country's first
# day/week/month instead of dropping those warm-up rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [409]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its three lags, so the
# components encode the prediction target (leakage). PCA is also fitted on
# unscaled data, so large-magnitude columns dominate the components. Fit on
# scaled feature columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[409]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [410]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA input actually has 11 columns here (7 features +
# target + 3 lags), so these are the first 7 of 11 components.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [411]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are PC1..PC7 mislabeled with raw feature names; each PC
# mixes all PCA input columns, and the labels mislead the importance table
# below -- rename to 'PC1'..'PC7' when refactoring.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Positional copy; valid because both frames share the same row order.
principal_df['location'] = df_updated['location'].values
In [412]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): unused downstream -- X is built from principal_df, not from
# these dummy columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [413]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Positional X/y alignment; both arrays come from frames with identical order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split on a time series exposes "future" rows at train
# time; a chronological split would be more honest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [414]:
# Fit scaling on the training set
# Fitted on the training split only, so no test-statistics leakage here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[414]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [415]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [416]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [417]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 324 candidate combinations; with cv=10 below that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [418]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring is the regressor's R^2. NOTE(review): the near-perfect score
# reflects the target-derived lag columns entering the PCA components.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991149035005054
In [419]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [420]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0032604762329485467
R2 Score: 0.9995750530257127
RMSE: 0.057101
Entropy Value: 0.0004307174874771804
In [421]:
# Rank model inputs by XGBoost importance.
# NOTE(review): the inputs are principal components mislabeled with raw
# feature names (see the PCA cells of this section), so these scores are PC
# importances, not raw-feature importances.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Trailing expression -> rendered as Out[] table.
feature_importances
Out[421]:
feature importance
1 diabetes_prevalence 0.562829
0 cardiovasc_death_rate 0.297571
5 aged_65_older 0.104378
2 female_smokers 0.016723
6 median_age 0.015953
3 male_smokers 0.002131
4 life_expectancy 0.000416
In [422]:
# Importing the dataframe of all 26 countries
# Re-read to reset df_updated after the in-place mutations above.
# NOTE(review): hardcoded absolute Windows path; prefer a DATA_DIR constant.
# The per-country-pair pipeline is duplicated -- a parameterised helper would
# avoid re-reading the CSV for every pair.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[422]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [423]:
country1 = 'Luxembourg'
country2 = 'Netherlands'

# Restrict to the country-health-index features (plus the location/date
# identifiers and the 'Mortality Rate' target), then keep only the two
# countries under comparison.
health_index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population', 'Mortality Rate']
df_updated = df_updated.loc[:, health_index_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [424]:
df_updated
Out[424]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 48472.545 17564020 0.000000
... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 0.2 94277.965 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 0.2 94277.965 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 0.2 94277.965 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 0.2 94277.965 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 0.2 94277.965 647601 0.377872

2078 rows × 8 columns

In [425]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Per-country lags via groupby('location'); shift(1/7/30) assumes daily,
# gap-free, date-sorted rows within each country -- TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [426]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fabricates a zero mortality history for each country's first
# day/week/month instead of dropping those warm-up rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [427]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its three lags, so the
# components encode the prediction target (leakage). PCA is also fitted on
# unscaled data, letting large-magnitude columns (e.g. 'population') dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[427]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [428]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): PCA input actually has 9 columns (5 features + target + 3
# lags); these are the first 5 of 9 components.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [429]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are PC1..PC5 mislabeled with raw feature names; the
# labels mislead the importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
# Positional copy; valid because both frames share the same row order.
principal_df['location'] = df_updated['location'].values
In [430]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): unused downstream -- X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [431]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population']
X = principal_df[selected_cols].values
# Positional X/y alignment; both arrays come from frames with identical order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split on a time series exposes "future" rows at train
# time; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [432]:
# Fit scaling on the training set
# Fitted on the training split only -- no test-statistics leakage here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[432]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [433]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [434]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [435]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 324 candidate combinations; with cv=10 below that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [436]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring is the regressor's R^2. NOTE(review): the near-perfect score
# reflects the target-derived lag columns entering the PCA components.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9977757658046478
In [437]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [438]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009316482430350636
R2 Score: 0.998785756822954
RMSE: 0.096522
Entropy Value: 0.001636704915774484
In [439]:
# Rank model inputs by XGBoost importance.
# NOTE(review): the inputs are principal components mislabeled with raw
# feature names (see the PCA cells of this section), so these scores are PC
# importances, not raw-feature importances.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
# Trailing expression -> rendered as Out[] table.
feature_importances
Out[439]:
feature importance
1 human_development_index 0.846066
2 extreme_poverty 0.097140
0 hospital_beds_per_thousand 0.048313
3 gdp_per_capita 0.007219
4 population 0.001262
In [440]:
# Importing the dataframe of all 26 countries
# Re-read to reset df_updated after the in-place mutations above.
# NOTE(review): hardcoded absolute Windows path; prefer a DATA_DIR constant.
# A parameterised pipeline helper would remove the need to re-read the CSV
# for every country pair.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[440]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [441]:
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Restrict to the population-health-index features (plus the location/date
# identifiers and the 'Mortality Rate' target), then keep only the two
# countries under comparison.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[:, population_health_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [442]:
df_updated
Out[442]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14645 Switzerland 12/26/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14646 Switzerland 12/27/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322922
14647 Switzerland 12/28/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.323082
14648 Switzerland 12/29/2022 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.322149

2102 rows × 10 columns

In [443]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Per-country lags via groupby('location'); shift(1/7/30) assumes daily,
# gap-free, date-sorted rows within each country -- TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [444]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): fabricates a zero mortality history for each country's first
# day/week/month instead of dropping those warm-up rows.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [445]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its three lags, so the
# components encode the prediction target (leakage). PCA is also fitted on
# unscaled data, letting large-magnitude columns dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[445]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [446]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): PCA input actually has 11 columns (7 features + target + 3
# lags); these are the first 7 of 11 components.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [447]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): columns are PC1..PC7 mislabeled with raw feature names; the
# labels mislead the importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Positional copy; valid because both frames share the same row order.
principal_df['location'] = df_updated['location'].values
In [448]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): unused downstream -- X is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [449]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Positional X/y alignment; both arrays come from frames with identical order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split on a time series exposes "future" rows at train
# time; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [450]:
# Fit scaling on the training set
# Fitted on the training split only -- no test-statistics leakage here.
scaler = StandardScaler()
scaler.fit(X_train)
Out[450]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [451]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [452]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [453]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 324 candidate combinations; with cv=10 below that is 3240 model fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [454]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring is the regressor's R^2. NOTE(review): the optimistic score
# reflects the target-derived lag columns entering the PCA components.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9534896177522434
In [455]:
# GridSearchCV already refits the best parameter combination on the full
# training set (refit=True by default), so best_estimator_ is a fitted model;
# the extra fit() call here was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [456]:
# Evaluate the XGBoost model on the held-out test set: MSE, RMSE, R^2 and
# scipy's entropy. NOTE(review): scipy.stats.entropy normalises both inputs
# into probability distributions and returns their KL divergence — confirm
# this is a meaningful metric for raw mortality rates.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)

print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE:  3.3003716461415755
R2 Score: 0.8696787142284871
RMSE: 1.816693
Entropy Value: 0.010589221412772479
In [457]:
# Feature importances of the fitted model, sorted descending.
# NOTE(review): rows are labelled with the original feature names even though
# the model was trained on PCA components — interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[457]:
feature importance
0 cardiovasc_death_rate 0.453094
1 diabetes_prevalence 0.312006
2 female_smokers 0.152936
6 median_age 0.034726
5 aged_65_older 0.026498
4 life_expectancy 0.015497
3 male_smokers 0.005243
In [458]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[458]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [459]:
# Two-country comparison (country health index model)
country1 = 'Switzerland'
country2 = 'United Kingdom'

# Keep only the country-health-index features plus identifiers and the target,
# restricted to the selected pair of countries.
cols = ['location', 'date', 'hospital_beds_per_thousand',
        'human_development_index', 'extreme_poverty', 'gdp_per_capita',
        'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [460]:
# Inspect the filtered two-country frame (rich display)
df_updated
Out[460]:
location date hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.932 0.20 39753.244 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.932 0.20 39753.244 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.932 0.20 39753.244 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.932 0.20 39753.244 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.932 0.20 39753.244 67508936 22.222222
... ... ... ... ... ... ... ... ...
14644 Switzerland 12/25/2022 4.53 0.955 0.03 57410.166 8740471 0.322922
14645 Switzerland 12/26/2022 4.53 0.955 0.03 57410.166 8740471 0.322922
14646 Switzerland 12/27/2022 4.53 0.955 0.03 57410.166 8740471 0.322922
14647 Switzerland 12/28/2022 4.53 0.955 0.03 57410.166 8740471 0.323082
14648 Switzerland 12/29/2022 4.53 0.955 0.03 57410.166 8740471 0.322149

2102 rows × 8 columns

In [461]:
# Convert the time series into a supervised-learning table by adding lagged
# mortality features (previous day / week / month). The shift() is grouped by
# location so a country's lags never leak across the boundary to the next
# country's rows — this tabular form is what lets XGBoost be applied to a
# time-series mortality-prediction problem.
lag_spec = {'prev_day_mortality': 1,
            'prev_week_mortality': 7,
            'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for col, periods in lag_spec.items():
    df_updated[col] = mortality_by_country.shift(periods)
In [462]:
# The earliest rows of each country have no history, so their lag features are
# NaN; treat missing history as zero mortality.
# NOTE(review): 0 is a modelling choice, not an observed value — confirm.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [463]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which here includes 'Mortality Rate' itself plus the three lag columns —
# the components are therefore built partly from the target (possible
# leakage). Confirm whether the target columns should be excluded.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[463]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [464]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): PCA was fitted on more columns than 5 (features + target +
# lag columns), so these are simply the first 5 components by explained
# variance, not a one-to-one mapping onto the 5 model inputs.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [465]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but the
# values are PCA scores (linear mixtures of all fitted columns) — the names
# are misleading; PC1..PC5 would be more accurate.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population'])
principal_df['location'] = df_updated['location'].values
In [466]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns do not appear to be
# used afterwards (only 'Mortality Rate' is read from df_updated below) —
# confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [467]:
# Model inputs are the PCA scores carried in principal_df; the target is the
# raw mortality rate from df_updated.
selected_cols = [
    'hospital_beds_per_thousand', 'human_development_index',
    'extreme_poverty', 'gdp_per_capita', 'population',
]
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [468]:
# Learn standardisation statistics from the training split only.
# StandardScaler.fit() returns the scaler itself; displaying it reproduces
# the original cell output.
scaler = StandardScaler().fit(X_train)
scaler
Out[468]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [469]:
# Apply scaling on the training set
# (uses the statistics learned from the training data in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [470]:
# Apply scaling on the test set with the training-set statistics
# (correct practice: the test split never influences the scaler)
X_test_scaled = scaler.transform(X_test)
In [471]:
# Candidate XGBoost regressor and the hyper-parameter grid to search
# (3*3*3*3*2*2 = 324 combinations).
xgb_model = xgb.XGBRegressor()

params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [472]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds; n_jobs=-1 parallelises across all
# CPU cores. best_score_ is the mean cross-validated score of the winning
# combination.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9514721454520239
In [473]:
# GridSearchCV already refits the best parameter combination on the full
# training set (refit=True by default), so best_estimator_ is a fitted model;
# the extra fit() call here was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [474]:
# Evaluate the XGBoost model on the held-out test set: MSE, RMSE, R^2 and
# scipy's entropy. NOTE(review): scipy.stats.entropy normalises both inputs
# into probability distributions and returns their KL divergence — confirm
# this is a meaningful metric for raw mortality rates.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)

print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE:  1.512484015018286
R2 Score: 0.9402767679886957
RMSE: 1.229831
Entropy Value: 0.0070634068971487385
In [475]:
# Feature importances of the fitted model, sorted descending.
# NOTE(review): rows are labelled with the original feature names even though
# the model was trained on PCA components — interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[475]:
feature importance
1 human_development_index 0.470955
2 extreme_poverty 0.184488
4 population 0.141774
0 hospital_beds_per_thousand 0.118008
3 gdp_per_capita 0.084775
In [2]:
# Country Pair by Pair Analysis relative to hospital beds per thousand
In [3]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hard-coded absolute Windows path — consider a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[3]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [4]:
# Per-country sub-frames for the 13 pairings of countries based on hospital
# beds per thousand (pairs are grouped two assignments at a time below).
def _country(frame, name):
    """Rows of `frame` whose location equals `name`."""
    return frame[frame.location == name]

df_Austria = _country(df, "Austria")
df_Bulgaria = _country(df, "Bulgaria")

df_Czechia = _country(df, "Czechia")
df_France = _country(df, "France")

df_Romania = _country(df, "Romania")
df_Slovakia = _country(df, "Slovakia")

df_Belgium = _country(df, "Belgium")
df_Estonia = _country(df, "Estonia")

df_Latvia = _country(df, "Latvia")
df_Luxembourg = _country(df, "Luxembourg")

df_Serbia = _country(df, "Serbia")
df_Slovenia = _country(df, "Slovenia")

df_Switzerland = _country(df, "Switzerland")
df_Canada = _country(df, "Canada")

df_Cyprus = _country(df, "Cyprus")
df_Denmark = _country(df, "Denmark")

df_Finland = _country(df, "Finland")
df_Iceland = _country(df, "Iceland")

df_Ireland = _country(df, "Ireland")
df_Italy = _country(df, "Italy")

df_Netherlands = _country(df, "Netherlands")
df_Portugal = _country(df, "Portugal")

df_Spain = _country(df, "Spain")
df_Sweden = _country(df, "Sweden")

df_UnitedKingdom = _country(df, "United Kingdom")
df_UnitedStates = _country(df, "United States")
In [5]:
# tail(-2) keeps all but the first two United Kingdom rows.
# NOTE(review): the reason for dropping exactly two rows (aligning date
# ranges with the other countries?) is not stated — confirm and document.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [6]:
# Stack the per-country frames (using the trimmed UK frame) into a single
# dataset and persist it for the later pair-by-pair analyses.
dataframes = [
    df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark,
    df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal,
    df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia,
    df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy,
    df_Sweden, df_Spain, df_Slovenia, df_UnitedStates,
]
dataframe_one = pd.concat(dataframes)

# Exporting the combined dataframe to a CSV file
# NOTE(review): the row index is written as an extra column — confirm whether
# index=False was intended.
dataframe_one.to_csv("dataframe-one.csv")
In [7]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hard-coded absolute Windows path — consider a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[7]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [8]:
# Two-country comparison (population health index model)
country1 = 'Austria'
country2 = 'Bulgaria'

# Keep only the population-health-index features plus identifiers and the
# target, restricted to the selected pair of countries.
cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
        'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
        'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [9]:
# Inspect the filtered two-country frame (rich display)
df_updated
Out[9]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
3121 Bulgaria 12/25/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.949845
3122 Bulgaria 12/26/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.950107
3123 Bulgaria 12/27/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.949883
3124 Bulgaria 12/28/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.949716
3125 Bulgaria 12/29/2022 424.688 5.81 30.1 44.4 75.05 20.801 44.7 2.949605

2066 rows × 10 columns

In [10]:
# Convert the time series into a supervised-learning table by adding lagged
# mortality features (previous day / week / month). The shift() is grouped by
# location so a country's lags never leak across the boundary to the next
# country's rows — this tabular form is what lets XGBoost be applied to a
# time-series mortality-prediction problem.
lag_spec = {'prev_day_mortality': 1,
            'prev_week_mortality': 7,
            'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for col, periods in lag_spec.items():
    df_updated[col] = mortality_by_country.shift(periods)
In [11]:
# The earliest rows of each country have no history, so their lag features are
# NaN; treat missing history as zero mortality.
# NOTE(review): 0 is a modelling choice, not an observed value — confirm.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [12]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which here includes 'Mortality Rate' itself plus the three lag columns —
# the components are therefore built partly from the target (possible
# leakage). Confirm whether the target columns should be excluded.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[12]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [13]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): PCA was fitted on more columns than 7 (features + target +
# lag columns), so these are simply the first 7 components by explained
# variance, not a one-to-one mapping onto the 7 model inputs.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [14]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but the
# values are PCA scores (linear mixtures of all fitted columns) — the names
# are misleading; PC1..PC7 would be more accurate.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [15]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns do not appear to be
# used afterwards (only 'Mortality Rate' is read from df_updated below) —
# confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [16]:
# Model inputs are the PCA scores carried in principal_df; the target is the
# raw mortality rate from df_updated.
selected_cols = [
    'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
    'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age',
]
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [17]:
# Learn standardisation statistics from the training split only.
# StandardScaler.fit() returns the scaler itself; displaying it reproduces
# the original cell output.
scaler = StandardScaler().fit(X_train)
scaler
Out[17]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [18]:
# Apply scaling on the training set
# (uses the statistics learned from the training data in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [19]:
# Apply scaling on the test set with the training-set statistics
# (correct practice: the test split never influences the scaler)
X_test_scaled = scaler.transform(X_test)
In [20]:
# Candidate XGBoost regressor and the hyper-parameter grid to search
# (3*3*3*3*2*2 = 324 combinations).
xgb_model = xgb.XGBRegressor()

params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [21]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds; n_jobs=-1 parallelises across all
# CPU cores. best_score_ is the mean cross-validated score of the winning
# combination.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'subsample': 0.8}
Best CV score: 0.9390150883353605
In [22]:
# GridSearchCV already refits the best parameter combination on the full
# training set (refit=True by default), so best_estimator_ is a fitted model;
# the extra fit() call here was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [23]:
# Evaluate the XGBoost model on the held-out test set: MSE, RMSE, R^2 and
# scipy's entropy. NOTE(review): scipy.stats.entropy normalises both inputs
# into probability distributions and returns their KL divergence — confirm
# this is a meaningful metric for raw mortality rates.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)

print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE:  0.0044848062013056154
R2 Score: 0.9975309010841117
RMSE: 0.066969
Entropy Value: 0.000715458116492232
In [24]:
# Feature importances of the fitted model, sorted descending.
# NOTE(review): rows are labelled with the original feature names even though
# the model was trained on PCA components — interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[24]:
feature importance
5 aged_65_older 0.306596
0 cardiovasc_death_rate 0.216755
1 diabetes_prevalence 0.208843
6 median_age 0.148879
2 female_smokers 0.054899
3 male_smokers 0.042820
4 life_expectancy 0.021207
In [25]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[25]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [26]:
# Two-country comparison (country health index model)
country1 = 'Austria'
country2 = 'Bulgaria'

# Keep only the country-health-index features plus identifiers and the target,
# restricted to the selected pair of countries.
cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
        'gdp_per_capita', 'population_density', 'population',
        'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), cols]
In [27]:
# Inspect the filtered two-country frame (rich display)
df_updated
Out[27]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 0.922 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ...
3121 Bulgaria 12/25/2022 0.816 1.5 18563.307 65.180 6781955 2.949845
3122 Bulgaria 12/26/2022 0.816 1.5 18563.307 65.180 6781955 2.950107
3123 Bulgaria 12/27/2022 0.816 1.5 18563.307 65.180 6781955 2.949883
3124 Bulgaria 12/28/2022 0.816 1.5 18563.307 65.180 6781955 2.949716
3125 Bulgaria 12/29/2022 0.816 1.5 18563.307 65.180 6781955 2.949605

2066 rows × 8 columns

In [28]:
# Convert the time series into a supervised-learning table by adding lagged
# mortality features (previous day / week / month). The shift() is grouped by
# location so a country's lags never leak across the boundary to the next
# country's rows — this tabular form is what lets XGBoost be applied to a
# time-series mortality-prediction problem.
lag_spec = {'prev_day_mortality': 1,
            'prev_week_mortality': 7,
            'prev_month_mortality': 30}
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for col, periods in lag_spec.items():
    df_updated[col] = mortality_by_country.shift(periods)
In [29]:
# The earliest rows of each country have no history, so their lag features are
# NaN; treat missing history as zero mortality.
# NOTE(review): 0 is a modelling choice, not an observed value — confirm.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [30]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location' and 'date',
# which here includes 'Mortality Rate' itself plus the three lag columns —
# the components are therefore built partly from the target (possible
# leakage). Confirm whether the target columns should be excluded.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[30]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [31]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): PCA was fitted on more columns than 5 (features + target +
# lag columns), so these are simply the first 5 components by explained
# variance, not a one-to-one mapping onto the 5 model inputs.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [32]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but the
# values are PCA scores (linear mixtures of all fitted columns) — the names
# are misleading; PC1..PC5 would be more accurate.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [33]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns do not appear to be
# used afterwards (only 'Mortality Rate' is read from df_updated below) —
# confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [34]:
# Model inputs are the PCA scores carried in principal_df; the target is the
# raw mortality rate from df_updated.
selected_cols = [
    'human_development_index', 'extreme_poverty', 'gdp_per_capita',
    'population_density', 'population',
]
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
In [35]:
# Learn standardisation statistics from the training split only.
# StandardScaler.fit() returns the scaler itself; displaying it reproduces
# the original cell output.
scaler = StandardScaler().fit(X_train)
scaler
Out[35]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [36]:
# Apply scaling on the training set
# (uses the statistics learned from the training data in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [37]:
# Apply scaling on the test set with the training-set statistics
# (correct practice: the test split never influences the scaler)
X_test_scaled = scaler.transform(X_test)
In [38]:
# Candidate XGBoost regressor and the hyper-parameter grid to search
# (3*3*3*3*2*2 = 324 combinations).
xgb_model = xgb.XGBRegressor()

params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [39]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds; n_jobs=-1 parallelises across all
# CPU cores. best_score_ is the mean cross-validated score of the winning
# combination.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9274228684091804
In [40]:
# GridSearchCV already refits the best parameter combination on the full
# training set (refit=True by default), so best_estimator_ is a fitted model;
# the extra fit() call here was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [41]:
# Evaluate the XGBoost model on the held-out test set: MSE, RMSE, R^2 and
# scipy's entropy. NOTE(review): scipy.stats.entropy normalises both inputs
# into probability distributions and returns their KL divergence — confirm
# this is a meaningful metric for raw mortality rates.
mse = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
entropy_val = entropy(y_test, y_pred)

print(f"MSE:  {mse}")
print(f"R2 Score: {score}")
print(f"RMSE: {rmse:f}")
print(f"Entropy Value: {entropy_val}")
MSE:  0.004503341096560387
R2 Score: 0.9975206967435615
RMSE: 0.067107
Entropy Value: 0.0004825642819307405
In [42]:
# Feature importances of the fitted model, sorted descending.
# NOTE(review): rows are labelled with the original feature names even though
# the model was trained on PCA components — interpret with care.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[42]:
feature importance
0 human_development_index 0.710086
1 extreme_poverty 0.179048
4 population 0.041480
2 gdp_per_capita 0.040024
3 population_density 0.029363
In [43]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — consider a configurable
# DATA_DIR / relative Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[43]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [44]:
# Countries compared in this run of the population-health-index analysis.
country1 = 'Czechia'
country2 = 'France'

# Restrict to the two countries and keep the population-health features plus
# identifiers and target. Rows are filtered first, then columns selected —
# the result is identical to selecting columns before filtering.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [45]:
df_updated
Out[45]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
4153 Czechia 3/1/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4154 Czechia 3/2/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4155 Czechia 3/3/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4156 Czechia 3/4/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
4157 Czechia 3/5/2020 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411892

2105 rows × 10 columns

In [46]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') shifts each country's series independently, so a lag
# never mixes one country's mortality history with the other's. shift(30)
# approximates "previous month" as 30 rows (daily data), not a calendar month.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [47]:
# The first day/week/month of each country's series has no lag history after
# shift(); treat that missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [48]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point iloc[:, 2:] spans the 7 health features PLUS
# 'Mortality Rate' and the three lag columns — the prediction target feeds
# directly into the components (target leakage). PCA is also fit on all rows
# before the train/test split (split leakage). Both likely inflate the
# reported test R^2; the sibling transform cell reuses the same slice, so a
# fix must change both cells together.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[48]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [49]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Keep only the first 7 of the 11 component scores (7 features + target +
# 3 lag columns went into the fit); components are ordered by explained
# variance, so this keeps the highest-variance directions.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [50]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA component scores relabeled with the
# original feature names — each "feature" column is really a principal
# component (a linear mixture of all PCA inputs). Downstream importances
# should therefore be read as component importances, not raw-feature ones.
# Row alignment with df_updated relies on identical positional ordering.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [51]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used as model
# inputs below (X is built from principal_df), so this encoding appears unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [52]:
# Model inputs: the 7 PCA component scores (labeled with raw-feature names in
# principal_df); target: the unscaled mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# 70/30 random split, seeded for reproducibility. NOTE(review): this is a
# random split of a time series, so training rows can postdate test rows —
# confirm this is intended for the analysis question.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [53]:
# Fit scaling on the training set
# StandardScaler is fit on the training split only and applied to both splits
# in the following cells, so no scaling statistics leak from the test set.
scaler = StandardScaler()
scaler.fit(X_train)
Out[53]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [54]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [55]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [56]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 combinations; with 10-fold CV in the next cell this
# trains 3,240 models plus the final refit, so expect a long runtime.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [57]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes over all CPU cores. For a regressor the default CV
# scoring is R^2; refit=True (the default) retrains the best configuration on
# the entire training set, available afterwards as best_estimator_.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9958649994549903
In [58]:
# Use the best configuration found by the grid search.
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full scaled training set, so the explicit second .fit() call was redundant
# identical retraining and is removed.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
In [59]:
# Evaluate the XGBoost model on the held-out test set.
#   MSE / RMSE — squared and root error in target units; R^2 — variance
#   explained. "Entropy" — scipy.stats.entropy(pk, qk) is the KL divergence
#   between the normalized vectors and returns inf whenever qk has a
#   non-positive entry where pk is positive; an epsilon floor keeps it finite.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
eps = 1e-10  # smoothing floor; avoids division by zero inside the KL term
entropy_val = entropy(y_test + eps, np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.09823454258048223
R2 Score: 0.9905286905291042
RMSE: 0.313424
Entropy Value: 0.001940918275872383
In [60]:
# Rank model inputs by XGBoost feature importance, highest first.
# NOTE(review): `selected_cols` are labels applied to PCA component scores,
# not the raw features themselves, so each "importance" below describes a
# principal component rather than the named health variable.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[60]:
feature importance
0 cardiovasc_death_rate 0.486330
5 aged_65_older 0.411606
1 diabetes_prevalence 0.081625
3 male_smokers 0.006524
2 female_smokers 0.006491
6 median_age 0.005829
4 life_expectancy 0.001595
In [61]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not portable; same file is
# re-read before every pipeline run to restore the full column set.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[61]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [62]:
# Countries compared in this run of the country-health-index analysis.
country1 = 'Czechia'
country2 = 'France'

# Restrict to the two countries and keep the country-health-index features
# plus identifiers and target. Filtering rows first, then selecting columns,
# yields exactly the same frame as the reverse order.
index_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
In [63]:
df_updated
Out[63]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
4153 Czechia 3/1/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
4154 Czechia 3/2/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
4155 Czechia 3/3/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
4156 Czechia 3/4/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
4157 Czechia 3/5/2020 0.900 0.00 32605.906 137.176 10493990 0.000000
... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 0.901 0.02 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 0.901 0.02 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 0.901 0.02 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 0.901 0.02 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 0.901 0.02 38605.671 122.578 67813000 0.411892

2105 rows × 8 columns

In [64]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') shifts each country's series independently so lags never
# cross the country boundary; shift(30) is a 30-row (not calendar-month) lag.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [65]:
# The first day/week/month of each country's series has no lag history after
# shift(); treat that missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [66]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): at this point iloc[:, 2:] spans the 5 country-index features
# PLUS 'Mortality Rate' and the three lag columns — the prediction target
# feeds into the components (target leakage), and PCA is fit on all rows
# before the train/test split. Both likely inflate the reported test R^2.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[66]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [67]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [68]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [69]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [70]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [71]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[71]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [72]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [73]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [74]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [75]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9945372413226707
In [76]:
# Use the best configuration found by the grid search.
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full scaled training set, so the explicit second .fit() call was redundant
# identical retraining and is removed.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
In [77]:
# Evaluate the XGBoost model on the held-out test set.
# "Entropy" — scipy.stats.entropy(pk, qk) is the KL divergence between the
# normalized vectors; it returns inf whenever qk has a non-positive entry
# where pk is positive (this run previously reported "Entropy Value: inf"
# for exactly that reason). An epsilon floor on both vectors keeps the
# diagnostic finite.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
eps = 1e-10  # smoothing floor; avoids division by zero inside the KL term
entropy_val = entropy(y_test + eps, np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06076319269700281
R2 Score: 0.9941415006640716
RMSE: 0.246502
Entropy Value: inf
In [78]:
# Rank model inputs by XGBoost feature importance, highest first.
# NOTE(review): `selected_cols` are labels applied to PCA component scores,
# not the raw features, so each "importance" describes a principal component
# rather than the named country-index variable.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[78]:
feature importance
1 extreme_poverty 0.600011
4 population 0.135056
2 gdp_per_capita 0.131486
0 human_development_index 0.103245
3 population_density 0.030201
In [79]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not portable; the file is
# re-read to restore the full column set before this pipeline run.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[79]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [80]:
# Countries compared in this run of the population-health-index analysis.
country1 = 'Romania'
country2 = 'Slovakia'

# Restrict to the two countries and keep the population-health features plus
# identifiers and target. Rows are filtered first, then columns selected —
# the result is identical to selecting columns before filtering.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [81]:
df_updated
Out[81]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.07 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.85 43.0 2.036403

2067 rows × 10 columns

In [82]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [83]:
# The first day/week/month of each country's series has no lag history after
# shift(); treat that missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [84]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] here includes 'Mortality Rate' and the three lag
# columns alongside the 7 health features — target leakage into the
# components — and PCA is fit on all rows before the train/test split.
# Both likely inflate the reported test R^2.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[84]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [85]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [86]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [87]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [88]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [89]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[89]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [90]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [91]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [92]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [93]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987386723526734
In [94]:
# Use the best configuration found by the grid search.
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full scaled training set, so the explicit second .fit() call was redundant
# identical retraining and is removed.
best_model = grid_search.best_estimator_

# Predict mortality rates for the held-out test set.
y_pred = best_model.predict(X_test_scaled)
In [95]:
# Evaluate the XGBoost model on the held-out test set.
# "Entropy" — scipy.stats.entropy(pk, qk) is the KL divergence between the
# normalized vectors and returns inf whenever qk has a non-positive entry
# where pk is positive; clipping predictions to a tiny positive floor (and
# adding eps to y_test) keeps the diagnostic finite.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
eps = 1e-10  # smoothing floor; avoids division by zero inside the KL term
entropy_val = entropy(y_test + eps, np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0012543326875114565
R2 Score: 0.9992927222966834
RMSE: 0.035417
Entropy Value: 0.0001720479338483219
In [96]:
# Rank model inputs by XGBoost feature importance, highest first.
# NOTE(review): `selected_cols` are labels applied to PCA component scores,
# not the raw features, so each "importance" describes a principal component
# rather than the named health variable.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[96]:
feature importance
0 cardiovasc_death_rate 0.533194
1 diabetes_prevalence 0.296478
5 aged_65_older 0.155072
2 female_smokers 0.010748
6 median_age 0.003793
3 male_smokers 0.000546
4 life_expectancy 0.000168
In [97]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not portable; the file is
# re-read to restore the full column set before this pipeline run.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[97]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [98]:
# Countries compared in this run of the country-health-index analysis.
country1 = 'Romania'
country2 = 'Slovakia'

# Restrict to the two countries and keep the country-health-index features
# plus identifiers and target. Filtering rows first, then selecting columns,
# yields exactly the same frame as the reverse order.
index_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), index_cols]
In [99]:
df_updated
Out[99]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 0.860 0.7 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 0.828 5.7 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 0.828 5.7 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 0.828 5.7 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 0.828 5.7 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 0.828 5.7 23313.199 85.129 19659270 2.036403

2067 rows × 8 columns

In [100]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [101]:
# The first day/week/month of each country's series has no lag history after
# shift(); treat that missing history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [102]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] here includes 'Mortality Rate' and the three lag
# columns alongside the 5 country-index features — target leakage into the
# components — and PCA is fit on all rows before the train/test split.
# Both likely inflate the reported test R^2.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[102]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [103]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [104]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [105]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [106]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [107]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[107]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [108]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [109]:
# Apply scaling on the test set
# Reuse the train-fitted parameters so the test set is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [110]:
# Base XGBoost regressor; its hyperparameters are tuned by the grid search in
# the next cell.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 candidate combinations).
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [111]:
# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968518647339172
In [112]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit retrains an identical model — it
# is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
# Predict mortality rates for the held-out (scaled) test rows.
y_pred = best_model.predict(X_test_scaled)
In [113]:
# Evaluate the tuned model on the held-out test set: MSE, RMSE, R^2 score, and
# the KL divergence ("entropy") between observed and predicted values.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)

# scipy.stats.entropy(pk, qk) normalizes both inputs into probability vectors
# and computes KL(pk || qk); it returns inf whenever some y_test > 0 pairs
# with y_pred <= 0. Raw mortality rates contain exact zeros (and predictions
# can dip non-positive), so restrict the divergence to strictly positive pairs
# to keep it finite and meaningful.
y_test_arr = np.asarray(y_test, dtype=float)
y_pred_arr = np.asarray(y_pred, dtype=float)
positive = (y_test_arr > 0) & (y_pred_arr > 0)
entropy_val = entropy(y_test_arr[positive], y_pred_arr[positive]) if positive.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0027447047876360095
R2 Score: 0.9984523495896989
RMSE: 0.052390
Entropy Value: inf
In [114]:
# Rank the model inputs by XGBoost's importance scores, highest first.
# NOTE(review): these "features" are principal components carrying borrowed
# feature names, so the ranking describes components, not the raw variables.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[114]:
feature importance
0 human_development_index 0.742119
1 extreme_poverty 0.205208
2 gdp_per_capita 0.027011
3 population_density 0.021735
4 population 0.003927
In [115]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path makes the notebook
# non-portable; prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[115]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [116]:
# The two countries compared in this run of the pipeline.
country1 = 'Belgium'
country2 = 'Estonia'

# Keep the population-health predictors plus the target, then restrict the
# panel to the two countries under comparison.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[health_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [117]:
df_updated
Out[117]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
1039 Belgium 2/4/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1040 Belgium 2/5/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1041 Belgium 2/6/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1042 Belgium 2/7/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
1043 Belgium 2/8/2020 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.464100
7306 Estonia 12/26/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.464100
7307 Estonia 12/27/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.463645
7308 Estonia 12/28/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.466423
7309 Estonia 12/29/2022 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.466423

2121 rows × 10 columns

In [118]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series within each country by one day, one week, and one
# month so the time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [119]:
# The earliest rows of each country have no history, so the lags are NaN there;
# replace them with 0.
# NOTE(review): 0 is also a legitimate mortality value in this data, so the
# imputed rows are indistinguishable from true zeros — confirm this is intended.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col in lag_cols:
    df_updated[col] = df_updated[col].fillna(0)
In [120]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] includes 'Mortality Rate' (the prediction
# target) plus its three lagged copies created above, so the fitted components
# are partly built from the target itself — exclude those columns to avoid
# target leakage.
# NOTE(review): PCA is scale-sensitive and these columns are unscaled, so the
# largest-magnitude column dominates the leading components; consider
# standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[120]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [121]:
# Keep only the first n_components principal-component scores; the count is
# chosen to match the number of input variables used in the XGBoost analysis
# for the population health index.
n_components = 7
component_scores = pca.transform(df_updated.iloc[:,2:])
principal_components = component_scores[:, :n_components]
In [122]:
# Collect the retained principal-component scores into a DataFrame and
# re-attach the country label for each row.
# NOTE(review): these columns are principal components, not the original
# variables — reusing raw feature names for PC1..PC7 is misleading, and the
# later "feature importances" therefore rank components, not features.
component_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(principal_components, columns=component_labels)
principal_df['location'] = df_updated['location'].values
In [123]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy 'location_*' columns created here are not used by the
# model below (X is built from principal_df); this call also removes the
# original 'location' column, so it must run after principal_df has captured
# the labels.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [124]:
# Build the model matrix: retained principal components as predictors, raw
# mortality rate as the target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing (fixed seed for reproducibility).
# NOTE(review): rows are daily time-series observations, so a shuffled random
# split mixes temporally adjacent days across train and test — consider a
# chronological split for an honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [125]:
# Fit scaling on the training set
# Learn standardization parameters (mean/std) from the training split only so
# test-set statistics cannot leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[125]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [126]:
# Apply scaling on the training set
# Standardize the training predictors with the train-fitted parameters.
X_train_scaled = scaler.transform(X_train)
In [127]:
# Apply scaling on the test set
# Reuse the train-fitted parameters so the test set is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [128]:
# Base XGBoost regressor; its hyperparameters are tuned by the grid search in
# the next cell.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 candidate combinations).
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [129]:
# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989923831042263
In [130]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit retrains an identical model — it
# is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
# Predict mortality rates for the held-out (scaled) test rows.
y_pred = best_model.predict(X_test_scaled)
In [131]:
# Evaluate the tuned model on the held-out test set: MSE, RMSE, R^2 score, and
# the KL divergence ("entropy") between observed and predicted values.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)

# scipy.stats.entropy(pk, qk) normalizes both inputs into probability vectors
# and computes KL(pk || qk); it returns inf whenever some y_test > 0 pairs
# with y_pred <= 0. Raw mortality rates contain exact zeros (and predictions
# can dip non-positive), so restrict the divergence to strictly positive pairs
# to keep it finite and meaningful.
y_test_arr = np.asarray(y_test, dtype=float)
y_pred_arr = np.asarray(y_pred, dtype=float)
positive = (y_test_arr > 0) & (y_pred_arr > 0)
entropy_val = entropy(y_test_arr[positive], y_pred_arr[positive]) if positive.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013599164509749826
R2 Score: 0.9989226161535332
RMSE: 0.116615
Entropy Value: 0.0008484654832886927
In [132]:
# Rank the model inputs by XGBoost's importance scores, highest first.
# NOTE(review): these "features" are principal components carrying borrowed
# feature names, so the ranking describes components, not the raw variables.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[132]:
feature importance
1 diabetes_prevalence 0.855940
0 cardiovasc_death_rate 0.105436
2 female_smokers 0.022113
5 aged_65_older 0.008940
6 median_age 0.007059
3 male_smokers 0.000455
4 life_expectancy 0.000058
In [133]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path makes the notebook
# non-portable; prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[133]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [134]:
# The two countries compared in this run of the pipeline.
country1 = 'Belgium'
country2 = 'Estonia'

# Keep the country health index predictors plus the target, then restrict the
# panel to the two countries under comparison.
index_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[index_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [135]:
df_updated
Out[135]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
1039 Belgium 2/4/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
1040 Belgium 2/5/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
1041 Belgium 2/6/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
1042 Belgium 2/7/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
1043 Belgium 2/8/2020 0.931 0.2 42658.576 375.564 11655923 0.000000
... ... ... ... ... ... ... ... ...
7305 Estonia 12/25/2022 0.892 0.5 29481.252 31.033 1326064 0.464100
7306 Estonia 12/26/2022 0.892 0.5 29481.252 31.033 1326064 0.464100
7307 Estonia 12/27/2022 0.892 0.5 29481.252 31.033 1326064 0.463645
7308 Estonia 12/28/2022 0.892 0.5 29481.252 31.033 1326064 0.466423
7309 Estonia 12/29/2022 0.892 0.5 29481.252 31.033 1326064 0.466423

2121 rows × 8 columns

In [136]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series within each country by one day, one week, and one
# month so the time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [137]:
# The earliest rows of each country have no history, so the lags are NaN there;
# replace them with 0.
# NOTE(review): 0 is also a legitimate mortality value in this data, so the
# imputed rows are indistinguishable from true zeros — confirm this is intended.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col in lag_cols:
    df_updated[col] = df_updated[col].fillna(0)
In [138]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] includes 'Mortality Rate' (the prediction
# target) plus its three lagged copies created above, so the fitted components
# are partly built from the target itself — exclude those columns to avoid
# target leakage.
# NOTE(review): PCA is scale-sensitive and these columns are unscaled, so the
# largest-magnitude column dominates the leading components; consider
# standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[138]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [139]:
# Keep only the first n_components principal-component scores; the count is
# chosen to match the number of input variables used in the XGBoost analysis
# for the country health index.
n_components = 5
component_scores = pca.transform(df_updated.iloc[:,2:])
principal_components = component_scores[:, :n_components]
In [140]:
# Collect the retained principal-component scores into a DataFrame and
# re-attach the country label for each row.
# NOTE(review): these columns are principal components, not the original
# variables — reusing raw feature names for PC1..PC5 is misleading, and the
# later "feature importances" therefore rank components, not features.
component_labels = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
principal_df = pd.DataFrame(principal_components, columns=component_labels)
principal_df['location'] = df_updated['location'].values
In [141]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy 'location_*' columns created here are not used by the
# model below (X is built from principal_df); this call also removes the
# original 'location' column, so it must run after principal_df has captured
# the labels.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [142]:
# Build the model matrix: retained principal components as predictors, raw
# mortality rate as the target.
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing (fixed seed for reproducibility).
# NOTE(review): rows are daily time-series observations, so a shuffled random
# split mixes temporally adjacent days across train and test — consider a
# chronological split for an honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [143]:
# Fit scaling on the training set
# Learn standardization parameters (mean/std) from the training split only so
# test-set statistics cannot leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[143]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [144]:
# Apply scaling on the training set
# Standardize the training predictors with the train-fitted parameters.
X_train_scaled = scaler.transform(X_train)
In [145]:
# Apply scaling on the test set
# Reuse the train-fitted parameters so the test set is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [146]:
# Base XGBoost regressor; its hyperparameters are tuned by the grid search in
# the next cell.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 candidate combinations).
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [147]:
# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9981079962252715
In [148]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit retrains an identical model — it
# is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
# Predict mortality rates for the held-out (scaled) test rows.
y_pred = best_model.predict(X_test_scaled)
In [149]:
# Evaluate the tuned model on the held-out test set: MSE, RMSE, R^2 score, and
# the KL divergence ("entropy") between observed and predicted values.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)

# scipy.stats.entropy(pk, qk) normalizes both inputs into probability vectors
# and computes KL(pk || qk); it returns inf whenever some y_test > 0 pairs
# with y_pred <= 0. Raw mortality rates contain exact zeros (and predictions
# can dip non-positive), so restrict the divergence to strictly positive pairs
# to keep it finite and meaningful.
y_test_arr = np.asarray(y_test, dtype=float)
y_pred_arr = np.asarray(y_pred, dtype=float)
positive = (y_test_arr > 0) & (y_pred_arr > 0)
entropy_val = entropy(y_test_arr[positive], y_pred_arr[positive]) if positive.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.04174307689414808
R2 Score: 0.9966929353111853
RMSE: 0.204311
Entropy Value: inf
In [150]:
# Rank the model inputs by XGBoost's importance scores, highest first.
# NOTE(review): these "features" are principal components carrying borrowed
# feature names, so the ranking describes components, not the raw variables.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[150]:
feature importance
1 extreme_poverty 0.595773
2 gdp_per_capita 0.261597
0 human_development_index 0.106863
3 population_density 0.034785
4 population 0.000981
In [151]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path makes the notebook
# non-portable; prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[151]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [152]:
# The two countries compared in this run of the pipeline.
country1 = 'Latvia'
country2 = 'Luxembourg'

# Keep the population-health predictors plus the target, then restrict the
# panel to the two countries under comparison.
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated[health_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [153]:
df_updated
Out[153]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2079 rows × 10 columns

In [154]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series within each country by one day, one week, and one
# month so the time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [155]:
# The earliest rows of each country have no history, so the lags are NaN there;
# replace them with 0.
# NOTE(review): 0 is also a legitimate mortality value in this data, so the
# imputed rows are indistinguishable from true zeros — confirm this is intended.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
for col in lag_cols:
    df_updated[col] = df_updated[col].fillna(0)
In [156]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] includes 'Mortality Rate' (the prediction
# target) plus its three lagged copies created above, so the fitted components
# are partly built from the target itself — exclude those columns to avoid
# target leakage.
# NOTE(review): PCA is scale-sensitive and these columns are unscaled, so the
# largest-magnitude column dominates the leading components; consider
# standardizing before fitting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[156]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [157]:
# Keep only the first n_components principal-component scores; the count is
# chosen to match the number of input variables used in the XGBoost analysis
# for the population health index.
n_components = 7
component_scores = pca.transform(df_updated.iloc[:,2:])
principal_components = component_scores[:, :n_components]
In [158]:
# Collect the retained principal-component scores into a DataFrame and
# re-attach the country label for each row.
# NOTE(review): these columns are principal components, not the original
# variables — reusing raw feature names for PC1..PC7 is misleading, and the
# later "feature importances" therefore rank components, not features.
component_labels = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
principal_df = pd.DataFrame(principal_components, columns=component_labels)
principal_df['location'] = df_updated['location'].values
In [159]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy 'location_*' columns created here are not used by the
# model below (X is built from principal_df); this call also removes the
# original 'location' column, so it must run after principal_df has captured
# the labels.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [160]:
# Build the model matrix: retained principal components as predictors, raw
# mortality rate as the target.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing (fixed seed for reproducibility).
# NOTE(review): rows are daily time-series observations, so a shuffled random
# split mixes temporally adjacent days across train and test — consider a
# chronological split for an honest error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [161]:
# Fit scaling on the training set
# Learn standardization parameters (mean/std) from the training split only so
# test-set statistics cannot leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[161]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [162]:
# Apply scaling on the training set
# Standardize the training predictors with the train-fitted parameters.
X_train_scaled = scaler.transform(X_train)
In [163]:
# Apply scaling on the test set
# Reuse the train-fitted parameters so the test set is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [164]:
# Base XGBoost regressor; its hyperparameters are tuned by the grid search in
# the next cell.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 candidate combinations).
params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 150],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [165]:
# Exhaustive grid search with 10-fold cross-validation (k = 10), parallelized
# across all available cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)

# Report the winning configuration and its mean cross-validated score.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9988494053319295
In [166]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set by
# default (refit=True), so this explicit fit retrains an identical model — it
# is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
# Predict mortality rates for the held-out (scaled) test rows.
y_pred = best_model.predict(X_test_scaled)
In [167]:
# Evaluate the tuned model on the held-out test set: MSE, RMSE, R^2 score, and
# the KL divergence ("entropy") between observed and predicted values.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)

# scipy.stats.entropy(pk, qk) normalizes both inputs into probability vectors
# and computes KL(pk || qk); it returns inf whenever some y_test > 0 pairs
# with y_pred <= 0. Raw mortality rates contain exact zeros (and predictions
# can dip non-positive), so restrict the divergence to strictly positive pairs
# to keep it finite and meaningful.
y_test_arr = np.asarray(y_test, dtype=float)
y_pred_arr = np.asarray(y_pred, dtype=float)
positive = (y_test_arr > 0) & (y_pred_arr > 0)
entropy_val = entropy(y_test_arr[positive], y_pred_arr[positive]) if positive.any() else float('nan')
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0014364972839239708
R2 Score: 0.9963828762121265
RMSE: 0.037901
Entropy Value: 0.0004385904063229603
In [168]:
# Rank the model inputs by XGBoost's importance scores, highest first.
# NOTE(review): these "features" are principal components carrying borrowed
# feature names, so the ranking describes components, not the raw variables.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[168]:
feature importance
1 diabetes_prevalence 0.760923
6 median_age 0.172695
0 cardiovasc_death_rate 0.035559
2 female_smokers 0.015432
5 aged_65_older 0.010307
3 male_smokers 0.004540
4 life_expectancy 0.000544
In [169]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path makes the notebook
# non-portable; prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[169]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [170]:
# The two countries compared in this run of the pipeline.
country1 = 'Latvia'
country2 = 'Luxembourg'

# Keep the country health index predictors plus the target, then restrict the
# panel to the two countries under comparison.
index_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[index_cols]
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2])]
In [171]:
df_updated
Out[171]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 0.916 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 0.916 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 0.916 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 0.916 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 0.916 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 0.866 0.7 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 0.866 0.7 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 0.866 0.7 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 0.866 0.7 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 0.866 0.7 25063.846 31.212 1850654 0.631969

2079 rows × 8 columns

In [172]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lag the mortality series within each country by one day, one week, and one
# month so the time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
df_updated['prev_day_mortality'] = mortality_by_country.shift(1)
df_updated['prev_week_mortality'] = mortality_by_country.shift(7)
df_updated['prev_month_mortality'] = mortality_by_country.shift(30)
In [173]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [174]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] selects every column after 'location'/'date', which here includes
# 'Mortality Rate' (the prediction target) and the three lag columns — the components are
# fit on the target itself (leakage). PCA is also fit on unscaled data, so large-magnitude
# columns such as 'population' dominate the components. Confirm both are intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[174]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [175]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Keep only the first 5 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [176]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of all numeric
# inputs), not the original variables — labelling them with the original feature names is
# misleading, and the "feature importances" computed later actually refer to components.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [177]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns created here are never used — X below is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [178]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first five principal components (see labelling caveat where principal_df is
# built); y is the raw mortality rate per row.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of time-series rows lets future observations into the
# training set; a chronological split would avoid look-ahead. Confirm intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [179]:
# Fit scaling on the training set
# Fit on the training split only, so test-set statistics do not leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[179]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [180]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [181]:
# Apply scaling on the test set
# Reuses the training-set mean/std rather than refitting — correct.
X_test_scaled = scaler.transform(X_test)
In [182]:
# Instantiate the (untuned) XGBoost regressor; GridSearchCV below selects the hyperparameters.
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid to search over (3*3*3*3*2*2 = 324 combinations).
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [183]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 324-candidate search across all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9974607843120801
In [184]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ on the full training data
# (refit=True by default), so this second fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [185]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) normalises its inputs and computes the KL
# divergence between them — it is not a standard regression error metric, and is
# undefined/infinite if any prediction is <= 0 where y_test > 0. Confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0028344280543466877
R2 Score: 0.9928628635395763
RMSE: 0.053239
Entropy Value: 0.00104898720029077
In [186]:
# Importances from the tuned model, sorted descending.
# NOTE(review): these are importances of principal components that were labelled with the
# original variable names when principal_df was built — they do not measure the original
# variables directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[186]:
feature importance
1 extreme_poverty 0.696631
2 gdp_per_capita 0.117422
3 population_density 0.094589
0 human_development_index 0.082520
4 population 0.008839
In [187]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory (pathlib).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[187]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [188]:
country1 = 'Serbia'
country2 = 'Slovenia'

# Restrict the frame to the two countries being compared and keep only the
# population-health-index features used by the XGBoost analysis below.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
     'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate'],
]
In [189]:
df_updated
Out[189]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2100 rows × 10 columns

In [190]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by 'location' keeps each country's series separate; note shift() is positional,
# so date gaps make the lags "previous 1/7/30 rows" rather than strict calendar lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [191]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling warm-up rows treats missing history as zero mortality — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [192]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the lag columns, so
# PCA is fit on the target (leakage), and on unscaled data so large-magnitude columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[192]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [193]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Keep only the first 7 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [194]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are principal components, not the original variables — the feature-name
# labels are misleading, and downstream "importances" refer to components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [195]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [196]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first seven principal components; y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of time-series rows allows look-ahead; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [197]:
# Fit scaling on the training set
# Fit on the training split only, so test-set statistics do not leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[197]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [198]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [199]:
# Apply scaling on the test set
# Reuses the training-set mean/std rather than refitting — correct.
X_test_scaled = scaler.transform(X_test)
In [200]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations are searched below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [201]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 324-candidate search across all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979858035568876
In [202]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ (refit=True by default);
# this second fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [203]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence of normalised
# distributions — not a regression metric, and undefined if any prediction is <= 0. Confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002574183795502055
R2 Score: 0.9985128745675512
RMSE: 0.050736
Entropy Value: 0.0008625859343290145
In [204]:
# Importances from the tuned model, sorted descending.
# NOTE(review): these are importances of mislabelled principal components, not of the
# original variables directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[204]:
feature importance
1 diabetes_prevalence 0.477247
6 median_age 0.226017
0 cardiovasc_death_rate 0.198241
5 aged_65_older 0.067375
2 female_smokers 0.018691
3 male_smokers 0.008959
4 life_expectancy 0.003470
In [205]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory (pathlib).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[205]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [206]:
country1 = 'Serbia'
country2 = 'Slovenia'

# Restrict the frame to the two countries being compared and keep only the
# country-health-index features used by the XGBoost analysis below.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'human_development_index', 'extreme_poverty',
     'gdp_per_capita', 'population_density', 'population', 'Mortality Rate'],
]
In [207]:
df_updated
Out[207]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 0.806 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 0.917 0.00 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 0.917 0.00 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 0.917 0.00 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 0.917 0.00 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 0.917 0.00 31400.840 102.619 2119843 0.536669

2100 rows × 8 columns

In [208]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by 'location' keeps each country's series separate; shift() is positional, so
# date gaps make the lags "previous 1/7/30 rows" rather than strict calendar lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [209]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling warm-up rows treats missing history as zero mortality — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [210]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the lag columns, so
# PCA is fit on the target (leakage), and on unscaled data so large-magnitude columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[210]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [211]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Keep only the first 5 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [212]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are principal components, not the original variables — the feature-name
# labels are misleading, and downstream "importances" refer to components.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [213]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [214]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first five principal components; y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of time-series rows allows look-ahead; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [215]:
# Fit scaling on the training set
# Fit on the training split only, so test-set statistics do not leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[215]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [216]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [217]:
# Apply scaling on the test set
# Reuses the training-set mean/std rather than refitting — correct.
X_test_scaled = scaler.transform(X_test)
In [218]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations are searched below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [219]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 324-candidate search across all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9980947692896143
In [220]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ (refit=True by default);
# this second fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [221]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence of normalised
# distributions — not a regression metric, and undefined if any prediction is <= 0. Confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004439908764146692
R2 Score: 0.9974350311533886
RMSE: 0.066633
Entropy Value: 0.001278250012959715
In [222]:
# Importances from the tuned model, sorted descending.
# NOTE(review): these are importances of mislabelled principal components, not of the
# original variables directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[222]:
feature importance
1 extreme_poverty 0.582591
0 human_development_index 0.184401
2 gdp_per_capita 0.111431
4 population 0.074040
3 population_density 0.047537
In [223]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory (pathlib).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[223]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [224]:
country1 = 'Switzerland'
country2 = 'Canada'

# Restrict the frame to the two countries being compared and keep only the
# population-health-index features used by the XGBoost analysis below.
df_updated = df_updated.loc[
    df_updated['location'].isin([country1, country2]),
    ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers',
     'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate'],
]
In [225]:
df_updated
Out[225]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2111 rows × 10 columns

In [226]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by 'location' keeps each country's series separate; shift() is positional, so
# date gaps make the lags "previous 1/7/30 rows" rather than strict calendar lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [227]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): zero-filling warm-up rows treats missing history as zero mortality — confirm intent.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [228]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and the lag columns, so
# PCA is fit on the target (leakage), and on unscaled data so large-magnitude columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[228]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [229]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Keep only the first 7 components (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [230]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these are principal components, not the original variables — the feature-name
# labels are misleading, and downstream "importances" refer to components.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [231]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used — X below is built from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [232]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first seven principal components; y is the raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of time-series rows allows look-ahead; consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [233]:
# Fit scaling on the training set
# Fit on the training split only, so test-set statistics do not leak into the scaler.
scaler = StandardScaler()
scaler.fit(X_train)
Out[233]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [234]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [235]:
# Apply scaling on the test set
# Reuses the training-set mean/std rather than refitting — correct.
X_test_scaled = scaler.transform(X_test)
In [236]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate combinations are searched below.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [237]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises the 324-candidate search across all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9993402391390793
In [238]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits best_estimator_ (refit=True by default);
# this second fit is redundant but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [239]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence of normalised
# distributions — not a regression metric, and undefined if any prediction is <= 0. Confirm intent.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004670392095474607
R2 Score: 0.9985772312539217
RMSE: 0.068340
Entropy Value: 0.0007126326576069527
In [240]:
# Importances from the tuned model, sorted descending.
# NOTE(review): these are importances of mislabelled principal components, not of the
# original variables directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[240]:
feature importance
1 diabetes_prevalence 0.860369
0 cardiovasc_death_rate 0.077197
5 aged_65_older 0.022825
6 median_age 0.020826
2 female_smokers 0.017709
3 male_smokers 0.000843
4 life_expectancy 0.000231
In [241]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — prefer a configurable data directory (pathlib).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[241]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [242]:
country1 = 'Switzerland'
country2 = 'Canada'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() after the boolean filter materialises an independent frame, so the lagged
# mortality columns assigned in later cells write to this frame rather than to a view
# of the original (avoids SettingWithCopyWarning and potentially silent no-op writes).
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [243]:
df_updated
Out[243]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 0.955 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 0.929 0.50 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 0.929 0.50 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 0.929 0.50 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 0.929 0.50 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 0.929 0.50 44017.591 4.037 38454328 1.093162

2111 rows × 8 columns

In [244]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') confines each shift to a single country's series, so a country's
# first rows become NaN instead of borrowing the previous country's values.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
# 'previous month' is approximated as a fixed 30-row (30-day) lag
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [245]:
# The first day/week/month of each country's series has no lagged observation to
# look back to; fill those leading NaNs with 0 instead of dropping the rows.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [246]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged mortality columns,
# so the target leaks into the components that later become the model inputs -- this
# likely inflates the reported R^2; fit PCA on the exogenous feature columns only.
# NOTE(review): PCA is fit on unscaled data, so high-variance columns (e.g.
# population) dominate the components; standardise before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[246]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [247]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): keeping the first 5 fitted components is not the same as keeping the
# 5 original variables -- each component is a linear mixture of every input column.
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [248]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores; reusing the original
# feature names is misleading (the first component is not the human development index).
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [249]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from principal_df
# and y from 'Mortality Rate'), so this step has no effect on the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [250]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; with daily time-series rows
# (and lagged mortality feeding the PCA inputs) adjacent days land on both sides of
# the split, leaking temporal information -- consider shuffle=False or a time-based
# split such as TimeSeriesSplit.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [251]:
# Fit scaling on the training set only -- the test set is transformed below with the
# statistics learned here, so the scaler never sees test data.
scaler = StandardScaler()
scaler.fit(X_train)
Out[251]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [252]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [253]:
# Apply the same train-fitted scaling to the test set (transform only, no refit)
X_test_scaled = scaler.transform(X_test)
In [254]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# (3 x 3 x 3 x 3 x 2 x 2 = 324 combinations, searched exhaustively by GridSearchCV)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [255]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 fits; n_jobs=-1 uses every core.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9982663840319784
In [256]:
# Fit the model using the best hyperparameters
# (GridSearchCV's default refit=True means best_estimator_ is already fit on the
# training data; this explicit fit retrains on the same data -- redundant but harmless.)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [257]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between normalised
# inputs; mortality rates are not probability distributions and predictions can be
# negative -- confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008554909392847082
R2 Score: 0.9973938681248052
RMSE: 0.092493
Entropy Value: 0.0016355829056392212
In [258]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[258]:
feature importance
1 extreme_poverty 0.725877
2 gdp_per_capita 0.104252
3 population_density 0.097611
0 human_development_index 0.063116
4 population 0.009144
In [259]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[259]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [260]:
country1 = 'Cyprus'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() after the boolean filter materialises an independent frame so the lagged
# mortality columns assigned later write to it directly (avoids SettingWithCopyWarning).
# NOTE(review): this whole pipeline is repeated once per country pair; extracting it
# into a function parameterised by (country1, country2, feature columns) would remove
# the copy-paste duplication.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [261]:
df_updated
Out[261]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.227772
6245 Denmark 12/26/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.227772
6246 Denmark 12/27/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.228905
6247 Denmark 12/28/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.229131
6248 Denmark 12/29/2022 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.229131

2089 rows × 10 columns

In [262]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [263]:
# Leading rows of each country have no lagged observation; zero-fill them so the
# frame keeps its full length.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [264]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged mortality columns,
# so the target leaks into the components later used as model inputs -- likely inflates
# the reported R^2. Also fit on unscaled data, so high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[264]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [265]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [266]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [267]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [268]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [269]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[269]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [270]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [271]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [272]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [273]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981664102171948
In [274]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [275]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.000955806320139703
R2 Score: 0.9992193301724611
RMSE: 0.030916
Entropy Value: 0.0005947775597474582
In [276]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[276]:
feature importance
6 median_age 0.471901
5 aged_65_older 0.362727
1 diabetes_prevalence 0.138179
0 cardiovasc_death_rate 0.013290
2 female_smokers 0.007320
3 male_smokers 0.003781
4 life_expectancy 0.002802
In [277]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[277]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [278]:
country1 = 'Cyprus'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() after the boolean filter materialises an independent frame so the lagged
# mortality columns assigned in later cells write to it directly (avoids
# SettingWithCopyWarning and potentially silent no-op writes to a view).
df_updated = df_updated[['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [279]:
df_updated
Out[279]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 0.887 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 0.887 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 0.887 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 0.887 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 0.887 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ...
6244 Denmark 12/25/2022 0.940 0.20 46682.515 136.520 5882259 0.227772
6245 Denmark 12/26/2022 0.940 0.20 46682.515 136.520 5882259 0.227772
6246 Denmark 12/27/2022 0.940 0.20 46682.515 136.520 5882259 0.228905
6247 Denmark 12/28/2022 0.940 0.20 46682.515 136.520 5882259 0.229131
6248 Denmark 12/29/2022 0.940 0.20 46682.515 136.520 5882259 0.229131

2089 rows × 8 columns

In [280]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [281]:
# Zero-fill the leading NaNs created by the per-country lag shifts so no rows are lost.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [282]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged mortality columns,
# so the target leaks into the components later used as model inputs -- likely inflates
# the reported R^2. Also fit on unscaled data, so high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[282]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [283]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [284]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [285]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [286]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [287]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[287]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [288]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [289]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [290]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [291]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9978802115578003
In [292]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [293]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002391339100663906
R2 Score: 0.998046836221977
RMSE: 0.048901
Entropy Value: 0.001094689244735057
In [294]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[294]:
feature importance
1 extreme_poverty 0.823494
0 human_development_index 0.106954
2 gdp_per_capita 0.050921
3 population_density 0.013679
4 population 0.004952
In [295]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[295]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [296]:
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() after the boolean filter materialises an independent frame so the lagged
# mortality columns assigned in later cells write to it directly (avoids
# SettingWithCopyWarning and potentially silent no-op writes to a view).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [297]:
df_updated
Out[297]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2102 rows × 10 columns

In [298]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [299]:
# Zero-fill the leading NaNs created by the per-country lag shifts so no rows are lost.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [300]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged mortality columns,
# so the target leaks into the components later used as model inputs -- likely inflates
# the reported R^2. Also fit on unscaled data, so high-variance columns dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[300]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [301]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [302]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [303]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [304]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [305]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[305]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [306]:
# Apply scaling on the training set (uses training-set mean and variance)
X_train_scaled = scaler.transform(X_train)
In [307]:
# Apply scaling on the test set with the statistics learned from the training set
X_test_scaled = scaler.transform(X_test)
In [308]:
# Define the XGBoost regressor with default settings; the grid below tunes it.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate settings
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [309]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 fits, parallelised across all cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best parameter combination
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9967115293459013
In [310]:
# GridSearchCV refits the winning configuration on the whole training set by default
# (refit=True), so best_estimator_ is already trained — the explicit second fit that
# used to be here only repeated that work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [311]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# *normalised probability distributions*, not a regression error metric; feeding raw
# mortality rates (which contain zeros) makes this value hard to interpret, and it is
# undefined when a prediction is zero or negative — consider dropping it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002631359506918055
R2 Score: 0.9977846875396988
RMSE: 0.051297
Entropy Value: 0.0008231147347694647
In [312]:
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal-component scores, so each
# "importance" belongs to a component, not to the original variable whose name it
# carries — interpret this ranking with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[312]:
feature importance
1 diabetes_prevalence 0.424154
0 cardiovasc_death_rate 0.346029
2 female_smokers 0.087274
5 aged_65_older 0.081840
6 median_age 0.050001
3 male_smokers 0.008944
4 life_expectancy 0.001758
In [313]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[313]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [314]:
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the country health index
keep_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, keep_cols]
In [315]:
# Inspect the filtered two-country frame (rendered via the notebook's rich display)
df_updated
Out[315]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
7310 Finland 1/29/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
7311 Finland 1/30/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
7312 Finland 1/31/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
7313 Finland 2/1/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
7314 Finland 2/2/2020 0.938 0.04 40585.721 18.136 5540745 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 0.949 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 0.949 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 0.949 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 0.949 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 0.949 0.20 46482.958 3.404 372903 0.11011

2102 rows × 8 columns

In [316]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features per country: 1 day, 1 week, and 1 month back.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [317]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [318]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on every row (train + test together), and iloc[:, 2:]
# still contains 'Mortality Rate' plus its lag columns, so the target leaks into the
# components — consider fitting PCA on the training features only. TODO confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[318]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [319]:
# Keep only the first 5 principal components — the same count as the input
# variables used in the XGBoost country-health-index model.
n_components = 5
component_scores = pca.transform(df_updated.iloc[:, 2:])
principal_components = component_scores[:, :n_components]
In [320]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# variables — reusing the raw feature names makes the later importance table read as
# if it ranked the original features; PC1..PC5 labels would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [321]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model features X are taken from principal_df in the next cell,
# so these dummy columns never reach the model — verify whether this step is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [322]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # principal-component scores; names are labels only
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles rows by default, so later dates can land
# in the training set of this time series — a chronological split would avoid
# look-ahead bias. TODO confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [323]:
# Fit scaling on the training set
# The scaler learns mean/std from the training split only; the test split is
# transformed with the same statistics in the following cells (no scaling leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[323]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [324]:
# Apply scaling on the training set (uses training-set mean and variance)
X_train_scaled = scaler.transform(X_train)
In [325]:
# Apply scaling on the test set with the statistics learned from the training set
X_test_scaled = scaler.transform(X_test)
In [326]:
# Define the XGBoost regressor with default settings; the grid below tunes it.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate settings
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [327]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 fits, parallelised across all cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best parameter combination
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9954359249296372
In [328]:
# GridSearchCV refits the winning configuration on the whole training set by default
# (refit=True), so best_estimator_ is already trained — the explicit second fit that
# used to be here only repeated that work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [329]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# *normalised probability distributions*, not a regression error metric; feeding raw
# mortality rates (which contain zeros) makes this value hard to interpret, and it is
# undefined when a prediction is zero or negative — consider dropping it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004634197824994556
R2 Score: 0.9960985201154687
RMSE: 0.068075
Entropy Value: 0.0016279455279392522
In [330]:
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal-component scores, so each
# "importance" belongs to a component, not to the original variable whose name it
# carries — interpret this ranking with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[330]:
feature importance
1 extreme_poverty 0.567476
2 gdp_per_capita 0.207296
0 human_development_index 0.151705
3 population_density 0.054498
4 population 0.019025
In [331]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[331]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [332]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the population health index
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, keep_cols]
In [333]:
# Inspect the filtered two-country frame (rendered via the notebook's rich display)
df_updated
Out[333]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2099 rows × 10 columns

In [334]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features per country: 1 day, 1 week, and 1 month back.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [335]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [336]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on every row (train + test together), and iloc[:, 2:]
# still contains 'Mortality Rate' plus its lag columns, so the target leaks into the
# components — consider fitting PCA on the training features only. TODO confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[336]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [337]:
# Keep only the first 7 principal components — the same count as the input
# variables used in the XGBoost population-health-index model.
n_components = 7
component_scores = pca.transform(df_updated.iloc[:, 2:])
principal_components = component_scores[:, :n_components]
In [338]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# variables — reusing the raw feature names makes the later importance table read as
# if it ranked the original features; PC1..PC7 labels would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [339]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model features X are taken from principal_df in the next cell,
# so these dummy columns never reach the model — verify whether this step is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [340]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # principal-component scores; names are labels only
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles rows by default, so later dates can land
# in the training set of this time series — a chronological split would avoid
# look-ahead bias. TODO confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [341]:
# Fit scaling on the training set
# The scaler learns mean/std from the training split only; the test split is
# transformed with the same statistics in the following cells (no scaling leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[341]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [342]:
# Apply scaling on the training set (uses training-set mean and variance)
X_train_scaled = scaler.transform(X_train)
In [343]:
# Apply scaling on the test set with the statistics learned from the training set
X_test_scaled = scaler.transform(X_test)
In [344]:
# Define the XGBoost regressor with default settings; the grid below tunes it.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate settings
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [345]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 fits, parallelised across all cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best parameter combination
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992082203444431
In [346]:
# GridSearchCV refits the winning configuration on the whole training set by default
# (refit=True), so best_estimator_ is already trained — the explicit second fit that
# used to be here only repeated that work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [347]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# *normalised probability distributions*, not a regression error metric; feeding raw
# mortality rates (which contain zeros) makes this value hard to interpret, and it is
# undefined when a prediction is zero or negative — consider dropping it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008136783743045974
R2 Score: 0.9993315447885287
RMSE: 0.090204
Entropy Value: 0.0003728532754072914
In [348]:
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal-component scores, so each
# "importance" belongs to a component, not to the original variable whose name it
# carries — interpret this ranking with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[348]:
feature importance
5 aged_65_older 0.368700
1 diabetes_prevalence 0.257725
0 cardiovasc_death_rate 0.220660
6 median_age 0.135502
2 female_smokers 0.015439
4 life_expectancy 0.001092
3 male_smokers 0.000882
In [349]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[349]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [350]:
country1 = 'Ireland'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the country health index
keep_cols = ['location', 'date', 'human_development_index', 'extreme_poverty',
             'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, keep_cols]
In [351]:
# Inspect the filtered two-country frame (rendered via the notebook's rich display)
df_updated
Out[351]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 0.955 0.2 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 0.892 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 0.892 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 0.892 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 0.892 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 0.892 2.0 35220.084 205.859 59037472 0.735109

2099 rows × 8 columns

In [352]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features per country: 1 day, 1 week, and 1 month back.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [353]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [354]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on every row (train + test together), and iloc[:, 2:]
# still contains 'Mortality Rate' plus its lag columns, so the target leaks into the
# components — consider fitting PCA on the training features only. TODO confirm.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[354]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [355]:
# Keep only the first 5 principal components — the same count as the input
# variables used in the XGBoost country-health-index model.
n_components = 5
component_scores = pca.transform(df_updated.iloc[:, 2:])
principal_components = component_scores[:, :n_components]
In [356]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# variables — reusing the raw feature names makes the later importance table read as
# if it ranked the original features; PC1..PC5 labels would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [357]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the model features X are taken from principal_df in the next cell,
# so these dummy columns never reach the model — verify whether this step is needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [358]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # principal-component scores; names are labels only
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles rows by default, so later dates can land
# in the training set of this time series — a chronological split would avoid
# look-ahead bias. TODO confirm intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [359]:
# Fit scaling on the training set
# The scaler learns mean/std from the training split only; the test split is
# transformed with the same statistics in the following cells (no scaling leakage).
scaler = StandardScaler()
scaler.fit(X_train)
Out[359]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [360]:
# Apply scaling on the training set (uses training-set mean and variance)
X_train_scaled = scaler.transform(X_train)
In [361]:
# Apply scaling on the test set with the statistics learned from the training set
X_test_scaled = scaler.transform(X_test)
In [362]:
# Define the XGBoost regressor with default settings; the grid below tunes it.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate settings
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [363]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds = 3240 fits, parallelised across all cores (n_jobs=-1)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# best_score_ is the mean cross-validated score of the best parameter combination
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977141803352069
In [364]:
# GridSearchCV refits the winning configuration on the whole training set by default
# (refit=True), so best_estimator_ is already trained — the explicit second fit that
# used to be here only repeated that work and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [365]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between two
# *normalised probability distributions*, not a regression error metric; feeding raw
# mortality rates (which contain zeros) makes this value hard to interpret, and it is
# undefined when a prediction is zero or negative — consider dropping it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.022014288790731012
R2 Score: 0.9981914763211477
RMSE: 0.148372
Entropy Value: 0.0014257512531825832
In [366]:
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal-component scores, so each
# "importance" belongs to a component, not to the original variable whose name it
# carries — interpret this ranking with caution.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[366]:
feature importance
1 extreme_poverty 0.605823
0 human_development_index 0.196309
3 population_density 0.107973
2 gdp_per_capita 0.076505
4 population 0.013390
In [367]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR /
# relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[367]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [368]:
country1 = 'Netherlands'
country2 = 'Portugal'

# Extracting important features for XGBoost Model Analysis for the population health index
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
             'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
             'median_age', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, keep_cols]
In [369]:
# Inspect the filtered two-country frame (rendered via the notebook's rich display)
df_updated
Out[369]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11514 Portugal 12/26/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11515 Portugal 12/27/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11516 Portugal 12/28/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977
11517 Portugal 12/29/2022 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.462977

2071 rows × 10 columns

In [370]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features per country: 1 day, 1 week, and 1 month back.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [371]:
# The first 1/7/30 observations of each country's series have no lagged value;
# treat that pre-observation window as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [372]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the FULL dataset before the train/test split below
# (information leakage), and df_updated.iloc[:, 2:] still contains 'Mortality Rate'
# and its three lag columns, so the target itself enters the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[372]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [373]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA input actually has 11 columns (7 features + target + 3 lags),
# so taking 7 components is a projection of that 11-dimensional space, not a
# component-per-feature mapping.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [374]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the column names reuse the ORIGINAL feature names, but each column is
# a principal component (a linear mixture of all PCA inputs), not the named variable.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [375]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from
# principal_df); this cell only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [376]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X = the first 7 principal components (see notes above); y = raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so this time series is split
# randomly across dates rather than chronologically -- TODO confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [377]:
# Fit scaling on the training set
# Scaling is fit on the training split only and reused on the test split below
# (correct); note the inputs are PCA components computed from unscaled data, so
# the usual scale-then-PCA order is inverted.
scaler = StandardScaler()
scaler.fit(X_train)
Out[377]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [378]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [379]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [380]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [381]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 hyperparameter combinations x 10 folds; n_jobs=-1 parallelises.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9990692627631146
In [382]:
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, GridSearchCV has already refit
# best_estimator_ on the full training set, so this explicit fit() repeats work
# (harmless but redundant).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [383]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into probability
# distributions and returns their KL divergence; applied to raw mortality values
# (which contain zeros) it is not a standard regression metric -- interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0038690919886303294
R2 Score: 0.9994973666749689
RMSE: 0.062202
Entropy Value: 0.00024895544762980527
In [384]:
# NOTE(review): selected_cols are original variable names, but the model was trained
# on principal components, so each "importance" belongs to a component rather than
# to the named variable; the near-perfect R^2 likely reflects the target leaking
# into the PCA input (see the PCA cell).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[384]:
feature importance
1 diabetes_prevalence 0.506845
0 cardiovasc_death_rate 0.422379
6 median_age 0.036212
2 female_smokers 0.026778
3 male_smokers 0.004003
5 aged_65_older 0.003742
4 life_expectancy 0.000041
In [385]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- not portable across machines;
# consider a single DATA_DIR constant defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[385]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [386]:
country1 = 'Netherlands'
country2 = 'Portugal'

# Restrict to the two countries under comparison and keep only the identifiers,
# the country-level socioeconomic features used by the XGBoost model, and the target.
country_health_cols = ['location', 'date', 'human_development_index',
                       'extreme_poverty', 'gdp_per_capita', 'population_density',
                       'population', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, country_health_cols]
In [387]:
df_updated
Out[387]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 0.944 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ...
11513 Portugal 12/25/2022 0.864 0.5 27936.896 112.371 10270857 0.462977
11514 Portugal 12/26/2022 0.864 0.5 27936.896 112.371 10270857 0.462977
11515 Portugal 12/27/2022 0.864 0.5 27936.896 112.371 10270857 0.462977
11516 Portugal 12/28/2022 0.864 0.5 27936.896 112.371 10270857 0.462977
11517 Portugal 12/29/2022 0.864 0.5 27936.896 112.371 10270857 0.462977

2071 rows × 8 columns

In [388]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the time series into a supervised-learning table: each row also carries the
# mortality rate observed 1, 7 and 30 days earlier, computed per country so lagged
# values never cross location boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [389]:
# The first 1/7/30 observations of each country's series have no lagged value;
# treat that pre-observation window as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [390]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the FULL dataset before the train/test split below
# (information leakage), and df_updated.iloc[:, 2:] still contains 'Mortality Rate'
# and its three lag columns, so the target itself enters the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[390]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [391]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA input actually has 9 columns (5 features + target + 3 lags),
# so taking 5 components is a projection of that 9-dimensional space, not a
# component-per-feature mapping.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [392]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the column names reuse the ORIGINAL feature names, but each column is
# a principal component (a linear mixture of all PCA inputs), not the named variable.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [393]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from
# principal_df); this cell only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [394]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X = the first 5 principal components (see notes above); y = raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so this time series is split
# randomly across dates rather than chronologically -- TODO confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [395]:
# Fit scaling on the training set
# Scaling is fit on the training split only and reused on the test split below
# (correct); note the inputs are PCA components computed from unscaled data, so
# the usual scale-then-PCA order is inverted.
scaler = StandardScaler()
scaler.fit(X_train)
Out[395]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [396]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [397]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [398]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [399]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 hyperparameter combinations x 10 folds; n_jobs=-1 parallelises.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9978978820744411
In [400]:
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, GridSearchCV has already refit
# best_estimator_ on the full training set, so this explicit fit() repeats work
# (harmless but redundant).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [401]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into probability
# distributions and returns their KL divergence; applied to raw mortality values
# (which contain zeros) it is not a standard regression metric -- interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.011370657753647056
R2 Score: 0.9985228390714666
RMSE: 0.106633
Entropy Value: 0.0014961642094780694
In [402]:
# NOTE(review): selected_cols are original variable names, but the model was trained
# on principal components, so each "importance" belongs to a component rather than
# to the named variable; the near-perfect R^2 likely reflects the target leaking
# into the PCA input (see the PCA cell).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[402]:
feature importance
1 extreme_poverty 0.691019
2 gdp_per_capita 0.177634
0 human_development_index 0.087000
3 population_density 0.041211
4 population 0.003136
In [403]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- not portable across machines;
# consider a single DATA_DIR constant defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[403]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [404]:
country1 = 'Spain'
country2 = 'Sweden'

# Restrict to the two countries under comparison and keep only the identifiers,
# the population-health features used by the XGBoost model, and the target.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers', 'male_smokers',
                          'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, population_health_cols]
In [405]:
df_updated
Out[405]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
23011 Sweden 2/1/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23012 Sweden 2/2/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23013 Sweden 2/3/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23014 Sweden 2/4/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
23015 Sweden 2/5/2020 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2126 rows × 10 columns

In [406]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the time series into a supervised-learning table: each row also carries the
# mortality rate observed 1, 7 and 30 days earlier, computed per country so lagged
# values never cross location boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [407]:
# The first 1/7/30 observations of each country's series have no lagged value;
# treat that pre-observation window as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [408]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the FULL dataset before the train/test split below
# (information leakage), and df_updated.iloc[:, 2:] still contains 'Mortality Rate'
# and its three lag columns, so the target itself enters the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[408]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [409]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA input actually has 11 columns (7 features + target + 3 lags),
# so taking 7 components is a projection of that 11-dimensional space, not a
# component-per-feature mapping.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [410]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the column names reuse the ORIGINAL feature names, but each column is
# a principal component (a linear mixture of all PCA inputs), not the named variable.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [411]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from
# principal_df); this cell only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [412]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X = the first 7 principal components (see notes above); y = raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so this time series is split
# randomly across dates rather than chronologically -- TODO confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [413]:
# Fit scaling on the training set
# Scaling is fit on the training split only and reused on the test split below
# (correct); note the inputs are PCA components computed from unscaled data, so
# the usual scale-then-PCA order is inverted.
scaler = StandardScaler()
scaler.fit(X_train)
Out[413]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [414]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [415]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [416]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [417]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 hyperparameter combinations x 10 folds; n_jobs=-1 parallelises.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989620255205065
In [418]:
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, GridSearchCV has already refit
# best_estimator_ on the full training set, so this explicit fit() repeats work
# (harmless but redundant).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [419]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into probability
# distributions and returns their KL divergence; applied to raw mortality values
# (which contain zeros) it is not a standard regression metric -- interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013116878471431913
R2 Score: 0.9984614298768997
RMSE: 0.114529
Entropy Value: 0.0007355015755679387
In [420]:
# NOTE(review): selected_cols are original variable names, but the model was trained
# on principal components, so each "importance" belongs to a component rather than
# to the named variable; the near-perfect R^2 likely reflects the target leaking
# into the PCA input (see the PCA cell).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[420]:
feature importance
1 diabetes_prevalence 0.917876
2 female_smokers 0.040045
0 cardiovasc_death_rate 0.020071
6 median_age 0.012055
5 aged_65_older 0.006507
3 male_smokers 0.002973
4 life_expectancy 0.000473
In [421]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- not portable across machines;
# consider a single DATA_DIR constant defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[421]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [422]:
country1 = 'Spain'
country2 = 'Sweden'

# Restrict to the two countries under comparison and keep only the identifiers,
# the country-level socioeconomic features used by the XGBoost model, and the target.
country_health_cols = ['location', 'date', 'human_development_index',
                       'extreme_poverty', 'gdp_per_capita', 'population_density',
                       'population', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, country_health_cols]
In [423]:
df_updated
Out[423]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
23011 Sweden 2/1/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
23012 Sweden 2/2/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
23013 Sweden 2/3/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
23014 Sweden 2/4/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
23015 Sweden 2/5/2020 0.945 0.5 46949.283 24.718 10549349 0.000000
... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 0.904 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 0.904 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 0.904 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 0.904 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 0.904 1.0 34272.360 93.105 47558632 0.855148

2126 rows × 8 columns

In [424]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Turn the time series into a supervised-learning table: each row also carries the
# mortality rate observed 1, 7 and 30 days earlier, computed per country so lagged
# values never cross location boundaries.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_days, lag_col in [(1, 'prev_day_mortality'),
                          (7, 'prev_week_mortality'),
                          (30, 'prev_month_mortality')]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [425]:
# The first 1/7/30 observations of each country's series have no lagged value;
# treat that pre-observation window as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [426]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on the FULL dataset before the train/test split below
# (information leakage), and df_updated.iloc[:, 2:] still contains 'Mortality Rate'
# and its three lag columns, so the target itself enters the PCA input.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[426]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [427]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA input actually has 9 columns (5 features + target + 3 lags),
# so taking 5 components is a projection of that 9-dimensional space, not a
# component-per-feature mapping.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [428]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the column names reuse the ORIGINAL feature names, but each column is
# a principal component (a linear mixture of all PCA inputs), not the named variable.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [429]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used downstream (X comes from
# principal_df); this cell only changes df_updated's column layout.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [430]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X = the first 5 principal components (see notes above); y = raw mortality rate.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default, so this time series is split
# randomly across dates rather than chronologically -- TODO confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [431]:
# Fit scaling on the training set
# Scaling is fit on the training split only and reused on the test split below
# (correct); note the inputs are PCA components computed from unscaled data, so
# the usual scale-then-PCA order is inverted.
scaler = StandardScaler()
scaler.fit(X_train)
Out[431]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [432]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [433]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [434]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [435]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 hyperparameter combinations x 10 folds; n_jobs=-1 parallelises.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987146553286387
In [436]:
# Fit the model using the best hyperparameters
# NOTE(review): with the default refit=True, GridSearchCV has already refit
# best_estimator_ on the full training set, so this explicit fit() repeats work
# (harmless but redundant).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [437]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays into probability
# distributions and returns their KL divergence; applied to raw mortality values
# (which contain zeros) it is not a standard regression metric -- interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01981169414057557
R2 Score: 0.9976761482726947
RMSE: 0.140754
Entropy Value: 0.0009592747648842666
In [438]:
# NOTE(review): selected_cols are original variable names, but the model was trained
# on principal components, so each "importance" belongs to a component rather than
# to the named variable; the near-perfect R^2 likely reflects the target leaking
# into the PCA input (see the PCA cell).
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[438]:
feature importance
1 extreme_poverty 0.926065
2 gdp_per_capita 0.056652
3 population_density 0.011133
0 human_development_index 0.005290
4 population 0.000861
In [439]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- not portable across machines;
# consider a single DATA_DIR constant defined once at the top of the notebook.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[439]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [440]:
country1 = 'United Kingdom'
country2 = 'United States'

# Restrict to the population-health feature set and to the two countries under comparison,
# in a single .loc selection (rows by membership mask, columns by explicit list).
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [441]:
# Display the filtered two-country frame (rich notebook repr)
df_updated
Out[441]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [442]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Each shift(k) operates within a location group, so the first k rows of every
# country get NaN (filled with 0 in the next cell).
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [443]:
# Replace the NaN values that the lag shifts introduced at the start of each
# country's series with 0, column by column.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [444]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes the target ('Mortality Rate') and its lagged
# copies, so target information is mixed into the components (leakage). PCA is
# also fit on the full dataset before the train/test split, and on unscaled
# features, so large-magnitude columns dominate the components. TODO: fit PCA
# on scaled training features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[444]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [445]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Keep only the first 7 components of the projection (columns are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [446]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the named
# raw variable — the labels are misleading for downstream interpretation.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [447]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy-encoded frame is only used to extract the target
# below; the one-hot location columns themselves are never fed to the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [448]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): rows are time-ordered per country; a random split lets training
# see future dates — TODO consider a time-based split for honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [449]:
# Fit scaling on the training set (statistics are learned from training data
# only and then reused on the test set, avoiding leakage at this stage)
scaler = StandardScaler()
scaler.fit(X_train)
Out[449]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [450]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [451]:
# Apply scaling on the test set (uses the training-set mean/variance)
X_test_scaled = scaler.transform(X_test)
In [452]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [453]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3,240 model fits; n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters (best_score_ is the regressor's default score, R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9573756200168303
In [454]:
# Use the tuned model for prediction. GridSearchCV defaults to refit=True, so
# best_estimator_ has already been refit on the full training set after the
# search — the extra fit() call previously made here was redundant and removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [455]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes a KL divergence between
# probability distributions (it normalizes its inputs); applying it to raw
# regression values is not a standard error metric and returns inf when a
# second-argument value is 0 where the first is not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  3.067522668688002
R2 Score: 0.8735128665771991
RMSE: 1.751434
Entropy Value: 0.00997828364305316
In [456]:
# Rank inputs by the fitted model's importance scores.
# NOTE(review): the model was trained on PCA components, so these "feature"
# labels actually name principal components, not the original raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[456]:
feature importance
0 cardiovasc_death_rate 0.515435
1 diabetes_prevalence 0.265632
2 female_smokers 0.128937
5 aged_65_older 0.040833
6 median_age 0.026848
4 life_expectancy 0.013474
3 male_smokers 0.008841
In [457]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[457]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [458]:
country1 = 'United Kingdom'
country2 = 'United States'

# Restrict to the country-health-index feature set and to the two countries under
# comparison, in a single .loc selection (rows by membership mask, columns by list).
feature_cols = ['location', 'date', 'human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [459]:
# Display the filtered two-country frame (rich notebook repr)
df_updated
Out[459]:
location date human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 0.932 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 0.932 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 0.932 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 0.932 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 0.932 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 0.926 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 0.926 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 0.926 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 0.926 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 0.926 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 8 columns

In [460]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Each shift(k) operates within a location group; the first k rows per country get NaN.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [461]:
# Replace the NaN values that the lag shifts introduced at the start of each
# country's series with 0, column by column.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [462]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes the target ('Mortality Rate') and its lagged
# copies (leakage), and PCA is fit on the full, unscaled dataset before the
# train/test split. TODO: fit PCA on scaled training features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[462]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [463]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Keep only the first 5 components of the projection (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [464]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component, not the named raw variable.
principal_df = pd.DataFrame(data=principal_components, columns=['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [465]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy-encoded frame is only used to extract the target below;
# the one-hot location columns are never fed to the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [466]:
selected_cols = ['human_development_index', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of time-ordered data — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [467]:
# Fit scaling on the training set (statistics learned from training data only,
# then reused on the test set)
scaler = StandardScaler()
scaler.fit(X_train)
Out[467]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [468]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [469]:
# Apply scaling on the test set (uses the training-set mean/variance)
X_test_scaled = scaler.transform(X_test)
In [470]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [471]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3,240 model fits; n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters (best_score_ is the regressor's default score, R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9551791807835526
In [472]:
# Use the tuned model for prediction. GridSearchCV defaults to refit=True, so
# best_estimator_ has already been refit on the full training set after the
# search — the extra fit() call previously made here was redundant and removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [473]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between
# probability distributions, not a regression metric; it returns inf when a
# second-argument value is 0 where the first is not — the recorded output of
# this very cell shows "Entropy Value: inf", confirming the problem.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  1.4775238366856351
R2 Score: 0.9390753468347937
RMSE: 1.215534
Entropy Value: inf
In [474]:
# Rank inputs by the fitted model's importance scores.
# NOTE(review): the model was trained on PCA components, so these "feature"
# labels actually name principal components, not the original raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[474]:
feature importance
1 extreme_poverty 0.360283
4 population 0.281991
0 human_development_index 0.175041
2 gdp_per_capita 0.135475
3 population_density 0.047210
In [84]:
# Country Pair by Pair Analysis relative to human development index
In [85]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[85]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [86]:
# Showing the pairings of countries based on human development index (13 pairs of countries)
# Helper: all rows of the cleaned dataset belonging to one country.
def rows_for(country):
    return df[df['location'] == country]

df_Austria = rows_for("Austria")
df_Belgium = rows_for("Belgium")

df_Canada = rows_for("Canada")
df_Denmark = rows_for("Denmark")

df_Finland = rows_for("Finland")
df_Iceland = rows_for("Iceland")

df_Ireland = rows_for("Ireland")
df_Luxembourg = rows_for("Luxembourg")

df_Netherlands = rows_for("Netherlands")
df_Slovenia = rows_for("Slovenia")

df_Sweden = rows_for("Sweden")
df_Switzerland = rows_for("Switzerland")

df_UnitedKingdom = rows_for("United Kingdom")
df_UnitedStates = rows_for("United States")

df_Cyprus = rows_for("Cyprus")
df_Czechia = rows_for("Czechia")

df_Estonia = rows_for("Estonia")
df_France = rows_for("France")

df_Italy = rows_for("Italy")
df_Latvia = rows_for("Latvia")

df_Portugal = rows_for("Portugal")
df_Slovakia = rows_for("Slovakia")

df_Spain = rows_for("Spain")
df_Bulgaria = rows_for("Bulgaria")

df_Romania = rows_for("Romania")
df_Serbia = rows_for("Serbia")
In [87]:
# Drop the first two United Kingdom rows (tail(-2) is equivalent to iloc[2:]).
df_UnitedKingdom_new = df_UnitedKingdom.iloc[2:]
In [88]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): written to the current working directory while later cells read
# "C:/Users/marco/Downloads/dataframe-one.csv" — confirm these are the same file.
# Also, to_csv without index=False writes the row index as an extra column.
dataframe_one.to_csv("dataframe-one.csv")
In [89]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[89]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [481]:
country1 = 'Austria'
country2 = 'Belgium'

# Restrict to the population-health feature set and to the two countries under
# comparison, in a single .loc selection (rows by membership mask, columns by list).
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [482]:
# Display the filtered two-country frame (rich notebook repr)
df_updated
Out[482]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2095 Belgium 12/26/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2096 Belgium 12/27/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2097 Belgium 12/28/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2098 Belgium 12/29/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787

2099 rows × 10 columns

In [483]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Each shift(k) operates within a location group; the first k rows per country get NaN.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [484]:
# Replace the NaN values that the lag shifts introduced at the start of each
# country's series with 0, column by column.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [485]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] includes the target ('Mortality Rate') and its lagged
# copies (leakage), and PCA is fit on the full, unscaled dataset before the
# train/test split. TODO: fit PCA on scaled training features only, excluding the target.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[485]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [486]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Keep only the first 7 components of the projection (ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [487]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component, not the named raw variable.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [488]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy-encoded frame is only used to extract the target below;
# the one-hot location columns are never fed to the model.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [489]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of time-ordered data — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [490]:
# Fit scaling on the training set (statistics learned from training data only,
# then reused on the test set)
scaler = StandardScaler()
scaler.fit(X_train)
Out[490]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [491]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [492]:
# Apply scaling on the test set (uses the training-set mean/variance)
X_test_scaled = scaler.transform(X_test)
In [493]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [494]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidates x 10 folds = 3,240 model fits; n_jobs=-1 uses all CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters (best_score_ is the regressor's default score, R^2)
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985877971766527
In [495]:
# Use the tuned model for prediction. GridSearchCV defaults to refit=True, so
# best_estimator_ has already been refit on the full training set after the
# search — the extra fit() call previously made here was redundant and removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [496]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is a KL divergence between
# probability distributions, not a regression metric; it returns inf when a
# second-argument value is 0 where the first is not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005399476083504289
R2 Score: 0.9995439543541608
RMSE: 0.073481
Entropy Value: 0.0003649161068700861
In [497]:
# Rank inputs by the fitted model's importance scores.
# NOTE(review): the model was trained on PCA components, so these "feature"
# labels actually name principal components, not the original raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[497]:
feature importance
6 median_age 0.846396
0 cardiovasc_death_rate 0.062219
1 diabetes_prevalence 0.061911
5 aged_65_older 0.025860
2 female_smokers 0.002106
3 male_smokers 0.001366
4 life_expectancy 0.000142
In [498]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[498]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [499]:
country1 = 'Austria'
country2 = 'Belgium'

# Restrict to the country-health-index feature set and to the two countries under
# comparison, in a single .loc selection (rows by membership mask, columns by list).
feature_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [500]:
df_updated
Out[500]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.7 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.2 42658.576 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.2 42658.576 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.2 42658.576 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.2 42658.576 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.2 42658.576 375.564 11655923 0.711787

2099 rows × 8 columns

In [501]:
# Convert the OWID COVID-19 time series into a supervised-learning table by
# adding lagged mortality features (previous day / week / month) with pandas
# shift(), computed within each country.  This tabular form is what lets
# XGBoost be applied directly to rank mortality predictors.
#
# NOTE(review): shift() assumes rows are date-ordered within each location;
# confirm the upstream sort.

# Take an explicit copy first: df_updated is a row-filtered slice of a larger
# frame, and adding columns to such a slice can raise SettingWithCopyWarning
# or silently write to a temporary copy.
df_updated = df_updated.copy()

# Lagged mortality rates, computed per country (one groupby, reused).
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [502]:
# The lag columns are NaN wherever no prior observation exists (the first
# day / week / month of each country's series); treat those as zero.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [503]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] only drops 'location' and 'date', so the PCA fit
# matrix still contains 'Mortality Rate' (the prediction target) and its
# lagged copies -- target leakage into the derived features.  Exclude the
# target (and arguably the lags) before fitting.
# NOTE(review): PCA is fit on unscaled data, so the highest-variance raw
# column (population) dominates the leading components; standardize before
# PCA, not only before the model.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[503]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [504]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): keeping 5 of the fitted components discards variance in an
# uncontrolled way; choose n_components from pca.explained_variance_ratio_
# rather than from the count of original input variables.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [505]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear mixes of ALL
# input columns, including the mortality target and its lags), not the
# original features.  Labeling PC1..PC5 with raw feature names is misleading
# and propagates into the feature-importance table below.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [506]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards -- X is built
# from principal_df and only df_updated['Mortality Rate'] is read below --
# so this step could be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [507]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# NOTE(review): these names actually refer to principal components, not raw
# features (see the PCA cell above).
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffled split on daily time-series rows places
# near-identical neighboring days in both train and test, inflating the
# reported scores; prefer a chronological (or date-grouped) split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [508]:
# Fit scaling on the training set
# Mean/std are estimated on the training split only, so no test-set
# information leaks into the scaling itself.
scaler = StandardScaler()
scaler.fit(X_train)
Out[508]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [509]:
# Apply scaling on the training set
# (transform only -- the scaler was already fit in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [510]:
# Apply scaling on the test set
# (re-uses the training-set mean/std; never refit on the test data)
X_test_scaled = scaler.transform(X_test)
In [511]:
# Define XGBoost model.  A fixed random_state makes the stochastic
# subsample / colsample_bytree draws reproducible across kernel restarts
# (the grid below searches values < 1.0 for both, so runs are stochastic
# without a seed).
xgb_model = xgb.XGBRegressor(random_state=42)

# Hyperparameter grid to explore with cross-validated grid search.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [512]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): with shuffled folds over autocorrelated daily rows, every
# validation fold contains near-duplicates of training rows; this largely
# explains CV scores around 0.998.  A time-aware CV splitter would give an
# honest estimate.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979482681643022
In [513]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) has already refit
# best_estimator_ on the full training set, so this extra fit() is
# redundant -- and, without a fixed seed on the estimator, it can produce a
# slightly different model than the one that earned the printed CV score.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [514]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy treats both arguments as probability
# distributions (normalizing each to sum to 1) and returns their KL
# divergence.  Mortality rates are not a distribution over test samples, so
# this value is not a meaningful regression metric (and it becomes inf if
# any prediction is <= 0 where the target is positive); consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.017351533220202466
R2 Score: 0.9985344705576373
RMSE: 0.131725
Entropy Value: 0.0015777407587553624
In [515]:
# NOTE(review): these are importances of principal components that were
# mislabeled with raw-feature names (and the PCs mix in the target's lagged
# values).  Do NOT read this table as "extreme_poverty is the top raw
# predictor" -- it only says which component the trees used most.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[515]:
feature importance
1 extreme_poverty 0.700511
2 gdp_per_capita 0.139220
0 hospital_beds_per_thousand 0.123668
3 population_density 0.032006
4 population 0.004595
In [516]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- move the directory into a
# configurable DATA_DIR (pathlib.Path) constant near the imports so the
# notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[516]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [517]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): from here on, the entire Austria/Belgium pipeline (In[499]
# onward) is repeated verbatim for each country pair and feature set.
# Factor it into one function taking (country1, country2, feature_cols) and
# loop over the pairs instead of copy-pasting the cells.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [518]:
df_updated
Out[518]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2134 rows × 10 columns

In [519]:
# Convert the OWID COVID-19 time series into a supervised-learning table by
# adding lagged mortality features (previous day / week / month) with pandas
# shift(), computed within each country, so XGBoost can be applied directly.
#
# NOTE(review): shift() assumes rows are date-ordered within each location;
# confirm the upstream sort.

# Take an explicit copy first: df_updated is a row-filtered slice of a larger
# frame, and adding columns to such a slice can raise SettingWithCopyWarning
# or silently write to a temporary copy.
df_updated = df_updated.copy()

# Lagged mortality rates, computed per country (one groupby, reused).
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [520]:
# The lag columns are NaN wherever no prior observation exists (the first
# day / week / month of each country's series); treat those as zero.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [521]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] only drops 'location' and 'date', so the PCA fit
# matrix still contains 'Mortality Rate' (the prediction target) and its
# lagged copies -- target leakage into the derived features.  Exclude them
# before fitting, and standardize before PCA so no single high-variance
# column dominates the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[521]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [522]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [523]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [524]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [525]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [526]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[526]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [527]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [528]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [529]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [530]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992606850560544
In [531]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [532]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0025729522227882153
R2 Score: 0.9993859368044151
RMSE: 0.050724
Entropy Value: 0.00030984264476932686
In [533]:
# NOTE(review): importances of principal components mislabeled with
# raw-feature names (the PCs also mix in the target's lags) -- this table
# does not rank the original health variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[533]:
feature importance
1 diabetes_prevalence 0.665696
6 median_age 0.173524
0 cardiovasc_death_rate 0.127952
5 aged_65_older 0.019835
2 female_smokers 0.011763
3 male_smokers 0.001135
4 life_expectancy 0.000095
In [534]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[534]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [535]:
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [536]:
df_updated
Out[536]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
5188 Denmark 2/3/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
5189 Denmark 2/4/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
5190 Denmark 2/5/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
5191 Denmark 2/6/2020 2.5 0.2 46682.515 136.520 5882259 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.5 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.5 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.5 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.5 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.5 44017.591 4.037 38454328 1.093162

2134 rows × 8 columns

In [537]:
# Convert the OWID COVID-19 time series into a supervised-learning table by
# adding lagged mortality features (previous day / week / month) with pandas
# shift(), computed within each country, so XGBoost can be applied directly.
#
# NOTE(review): shift() assumes rows are date-ordered within each location;
# confirm the upstream sort.

# Take an explicit copy first: df_updated is a row-filtered slice of a larger
# frame, and adding columns to such a slice can raise SettingWithCopyWarning
# or silently write to a temporary copy.
df_updated = df_updated.copy()

# Lagged mortality rates, computed per country (one groupby, reused).
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [538]:
# The lag columns are NaN wherever no prior observation exists (the first
# day / week / month of each country's series); treat those as zero.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [539]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] only drops 'location' and 'date', so the PCA fit
# matrix still contains 'Mortality Rate' (the prediction target) and its
# lagged copies -- target leakage into the derived features.  Exclude them
# before fitting, and standardize before PCA so no single high-variance
# column dominates the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[539]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [540]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [541]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [542]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [543]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [544]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[544]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [545]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [546]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [547]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [548]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981308764210667
In [549]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [550]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006611715252313853
R2 Score: 0.998422041824106
RMSE: 0.081312
Entropy Value: 0.001601041202106293
In [551]:
# NOTE(review): importances of principal components mislabeled with
# raw-feature names (the PCs also mix in the target's lags) -- this table
# does not rank the original country-health variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[551]:
feature importance
1 extreme_poverty 0.723446
0 hospital_beds_per_thousand 0.146505
2 gdp_per_capita 0.065728
3 population_density 0.048554
4 population 0.015767
In [552]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[552]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [553]:
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [554]:
df_updated
Out[554]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2102 rows × 10 columns

In [555]:
# Convert the OWID COVID-19 time series into a supervised-learning table by
# adding lagged mortality features (previous day / week / month) with pandas
# shift(), computed within each country, so XGBoost can be applied directly.
#
# NOTE(review): shift() assumes rows are date-ordered within each location;
# confirm the upstream sort.

# Take an explicit copy first: df_updated is a row-filtered slice of a larger
# frame, and adding columns to such a slice can raise SettingWithCopyWarning
# or silently write to a temporary copy.
df_updated = df_updated.copy()

# Lagged mortality rates, computed per country (one groupby, reused).
mortality_by_country = df_updated.groupby('location')['Mortality Rate']
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [556]:
# The lag columns are NaN wherever no prior observation exists (the first
# day / week / month of each country's series); treat those as zero.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [557]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] only drops 'location' and 'date', so the PCA fit
# matrix still contains 'Mortality Rate' (the prediction target) and its
# lagged copies -- target leakage into the derived features.  Exclude them
# before fitting, and standardize before PCA so no single high-variance
# column dominates the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[557]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [558]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [559]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [560]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [561]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [562]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[562]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [563]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [564]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [565]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [566]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9967115293459013
In [567]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [568]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and returns their KL divergence — it is not a standard regression metric and
# returns inf when any prediction is <= 0 where the target is positive.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002631359506918055
R2 Score: 0.9977846875396988
RMSE: 0.051297
Entropy Value: 0.0008231147347694647
In [569]:
# Rank the model inputs by XGBoost's fitted importance scores (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[569]:
feature importance
1 diabetes_prevalence 0.424154
0 cardiovasc_death_rate 0.346029
2 female_smokers 0.087274
5 aged_65_older 0.081840
6 median_age 0.050001
3 male_smokers 0.008944
4 life_expectancy 0.001758
In [570]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not portable across machines;
# prefer a configurable data directory (e.g. pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[570]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [571]:
country1 = 'Finland'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() detaches the two-country subset from the parent frame so the lagged
# columns assigned in later cells cannot trigger SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [572]:
# Preview the filtered two-country frame (rendered via the notebook's rich repr).
df_updated
Out[572]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
7310 Finland 1/29/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
7311 Finland 1/30/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
7312 Finland 1/31/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
7313 Finland 2/1/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
7314 Finland 2/2/2020 3.28 0.04 40585.721 18.136 5540745 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.20 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.20 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.20 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.20 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.20 46482.958 3.404 372903 0.11011

2102 rows × 8 columns

In [573]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lag features in one loop: Mortality Rate shifted within each
# country by 1 day, 7 days, and 30 days respectively.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [574]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the earliest rows of each country have no history for shift() to draw on)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [575]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows before the train/test split (test
# data leaks into the transform) and on unscaled columns, so large-magnitude
# columns such as population will dominate the components — consider scaling first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[575]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [576]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Keep only the first 5 component scores (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [577]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# features — relabelling them with the feature names makes the later
# feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [578]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns appear unused downstream (X is
# built from principal_df), so this step only reshapes df_updated — verify.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [579]:
# Use the first 5 principal-component scores (columns reuse the original
# feature names) as model inputs, and Mortality Rate as the regression target.
# NOTE(review): relies on principal_df and df_updated having identical row order.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a daily time series places future days
# in the training set — a chronological split would give an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [580]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only; the test split is
# transformed later with these same statistics (no leakage at this step).
scaler = StandardScaler()
scaler.fit(X_train)
Out[580]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [581]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [582]:
# Apply scaling on the test set
# (transform only — reuses statistics fitted on the training set, no refit)
X_test_scaled = scaler.transform(X_test)
In [583]:
# Define XGBoost model with default settings; the grid search below tunes it.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [584]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises across all cores; default regressor scoring is R^2.
# NOTE(review): KFold splits ignore the date ordering, so CV scores are
# optimistic for time-series data — consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9954359249296372
In [585]:
# GridSearchCV already refits the best parameter combination on the whole
# training set (refit=True by default), so best_estimator_ is ready to use;
# the original's extra fit() call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [586]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and returns their KL divergence — it is not a standard regression metric and
# returns inf when any prediction is <= 0 where the target is positive.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004634197824994556
R2 Score: 0.9960985201154687
RMSE: 0.068075
Entropy Value: 0.0016279455279392522
In [587]:
# Rank the model inputs by XGBoost's fitted importance scores (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[587]:
feature importance
1 extreme_poverty 0.567476
2 gdp_per_capita 0.207296
0 hospital_beds_per_thousand 0.151705
3 population_density 0.054498
4 population 0.019025
In [588]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not portable across machines;
# prefer a configurable data directory (e.g. pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[588]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [589]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the two-country subset from the parent frame so the lagged
# columns assigned in later cells cannot trigger SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [590]:
# Preview the filtered two-country frame (rendered via the notebook's rich repr).
df_updated
Out[590]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388

2076 rows × 10 columns

In [591]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lag features in one loop: Mortality Rate shifted within each
# country by 1 day, 7 days, and 30 days respectively.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [592]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the earliest rows of each country have no history for shift() to draw on)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [593]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows before the train/test split (test
# data leaks into the transform) and on unscaled columns, so large-magnitude
# columns will dominate the components — consider scaling first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[593]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [594]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Keep only the first 7 component scores (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [595]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# features — relabelling them with the feature names makes the later
# feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [596]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns appear unused downstream (X is
# built from principal_df), so this step only reshapes df_updated — verify.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [597]:
# Use the first 7 principal-component scores (columns reuse the original
# feature names) as model inputs, and Mortality Rate as the regression target.
# NOTE(review): relies on principal_df and df_updated having identical row order.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a daily time series places future days
# in the training set — a chronological split would give an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [598]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only; the test split is
# transformed later with these same statistics (no leakage at this step).
scaler = StandardScaler()
scaler.fit(X_train)
Out[598]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [599]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [600]:
# Apply scaling on the test set
# (transform only — reuses statistics fitted on the training set, no refit)
X_test_scaled = scaler.transform(X_test)
In [601]:
# Define XGBoost model with default settings; the grid search below tunes it.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [602]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises across all cores; default regressor scoring is R^2.
# NOTE(review): KFold splits ignore the date ordering, so CV scores are
# optimistic for time-series data — consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987521060977691
In [603]:
# GridSearchCV already refits the best parameter combination on the whole
# training set (refit=True by default), so best_estimator_ is ready to use;
# the original's extra fit() call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [604]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and returns their KL divergence — it is not a standard regression metric and
# returns inf when any prediction is <= 0 where the target is positive.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015514128412379164
R2 Score: 0.9993205997164117
RMSE: 0.039388
Entropy Value: 0.0003261513916954156
In [605]:
# Rank the model inputs by XGBoost's fitted importance scores (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[605]:
feature importance
6 median_age 0.681071
5 aged_65_older 0.213986
0 cardiovasc_death_rate 0.084737
1 diabetes_prevalence 0.013333
2 female_smokers 0.006390
3 male_smokers 0.000304
4 life_expectancy 0.000178
In [606]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not portable across machines;
# prefer a configurable data directory (e.g. pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[606]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [607]:
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() detaches the two-country subset from the parent frame so the lagged
# columns assigned in later cells cannot trigger SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [608]:
# Preview the filtered two-country frame (rendered via the notebook's rich repr).
df_updated
Out[608]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.2 94277.965 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.2 94277.965 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.2 94277.965 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.2 94277.965 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.2 94277.965 231.447 647601 0.000000
... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.2 67335.293 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.2 67335.293 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.2 67335.293 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.2 67335.293 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.2 67335.293 69.874 5023108 0.491388

2076 rows × 8 columns

In [609]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lag features in one loop: Mortality Rate shifted within each
# country by 1 day, 7 days, and 30 days respectively.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [610]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the earliest rows of each country have no history for shift() to draw on)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [611]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows before the train/test split (test
# data leaks into the transform) and on unscaled columns, so large-magnitude
# columns such as population will dominate the components — consider scaling first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[611]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [612]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Keep only the first 5 component scores (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [613]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores, not the original
# features — relabelling them with the feature names makes the later
# feature-importance table easy to misread.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [614]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting dummy columns appear unused downstream (X is
# built from principal_df), so this step only reshapes df_updated — verify.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [615]:
# Use the first 5 principal-component scores (columns reuse the original
# feature names) as model inputs, and Mortality Rate as the regression target.
# NOTE(review): relies on principal_df and df_updated having identical row order.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random 70/30 split of a daily time series places future days
# in the training set — a chronological split would give an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [616]:
# Fit scaling on the training set
# Learn per-feature mean/std from the training split only; the test split is
# transformed later with these same statistics (no leakage at this step).
scaler = StandardScaler()
scaler.fit(X_train)
Out[616]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [617]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [618]:
# Apply scaling on the test set
# (transform only — reuses statistics fitted on the training set, no refit)
X_test_scaled = scaler.transform(X_test)
In [619]:
# Define XGBoost model with default settings; the grid search below tunes it.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [620]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises across all cores; default regressor scoring is R^2.
# NOTE(review): KFold splits ignore the date ordering, so CV scores are
# optimistic for time-series data — consider TimeSeriesSplit.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9951141561453272
In [621]:
# GridSearchCV already refits the best parameter combination on the whole
# training set (refit=True by default), so best_estimator_ is ready to use;
# the original's extra fit() call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [622]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and returns their KL divergence — it is not a standard regression metric and
# returns inf when any prediction is <= 0 where the target is positive.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00468197321349643
R2 Score: 0.9979496534742719
RMSE: 0.068425
Entropy Value: 0.0012251208567780243
In [623]:
# Rank the model inputs by XGBoost's fitted importance scores (descending).
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[623]:
feature importance
1 extreme_poverty 0.741234
0 hospital_beds_per_thousand 0.114263
2 gdp_per_capita 0.070959
4 population 0.047536
3 population_density 0.026008
In [624]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — not portable across machines;
# prefer a configurable data directory (e.g. pathlib.Path).
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[624]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [625]:
country1 = 'Netherlands'
country2 = 'Slovenia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() detaches the two-country subset from the parent frame so the lagged
# columns assigned in later cells cannot trigger SettingWithCopyWarning.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [626]:
# Preview the filtered two-country frame (rendered via the notebook's rich repr).
df_updated
Out[626]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2099 rows × 10 columns

In [627]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build the three lag features in one loop: Mortality Rate shifted within each
# country by 1 day, 7 days, and 30 days respectively.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag)
In [628]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the earliest rows of each country have no history for shift() to draw on)
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [629]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on ALL rows before the train/test split (test
# data leaks into the transform) and on unscaled columns, so large-magnitude
# columns will dominate the components — consider scaling first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[629]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [630]:
# Setting the number of principal components to 7 as this equals the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): keeping the first 7 PCs does not recover the 7 original input
# variables — each component is a linear mixture of every column passed to
# pca.fit (which here also includes the target and its lags).
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [631]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — relabelling them with the raw feature names is misleading, and
# the "feature importances" computed later actually describe PCs.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [632]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never fed to the model — X is
# built from principal_df below — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [633]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first PCs (mislabelled with raw feature names); y is the target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): PCA was fitted on ALL rows before this split, so test-set
# information (including the target) has already leaked into X. For time
# series, a chronological split would also be more appropriate than a random
# shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [634]:
# Fit scaling on the training set
# NOTE(review): standardizing *after* PCA (and for a tree-based model, which
# is insensitive to feature scale) has little effect; scaling belongs before
# PCA if it is needed at all.
scaler = StandardScaler()
scaler.fit(X_train)
Out[634]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [635]:
# Apply scaling on the training set
# (scaler statistics come from X_train only — no scaling leakage here)
X_train_scaled = scaler.transform(X_train)
In [636]:
# Apply scaling on the test set
# (reuses the training-set mean/std, as it should)
X_test_scaled = scaler.transform(X_test)
In [637]:
# Define the XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for GridSearchCV (3*3*3*3*2*2 = 324 candidates)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [638]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# NOTE(review): the near-perfect CV score reflects the target leakage from
# fitting PCA on columns that include 'Mortality Rate' and its lags, not
# genuine predictive power.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992039584453336
In [639]:
# GridSearchCV (refit=True by default) has already refit the best estimator
# on the full training set, so calling fit() again here repeats the same
# training run for nothing — use the refit estimator directly.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [640]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) with two arguments computes the
# Kullback-Leibler divergence between p and q after normalizing each to sum
# to 1 — it is not the entropy of the predictions, and treating mortality
# rates as probability distributions is questionable; consider dropping this
# metric or reporting MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005761184416829001
R2 Score: 0.9992968245245488
RMSE: 0.075902
Entropy Value: 0.0004786331730612395
In [641]:
# NOTE(review): these "importances" belong to principal components, not to
# the raw variables whose names were pasted onto them — the row labelled
# 'diabetes_prevalence' is really PC2, etc. To get attributable importances,
# train on the original (unrotated) features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[641]:
feature importance
1 diabetes_prevalence 0.564395
6 median_age 0.214805
5 aged_65_older 0.195681
2 female_smokers 0.016241
0 cardiovasc_death_rate 0.005649
3 male_smokers 0.002997
4 life_expectancy 0.000231
In [642]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[642]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [643]:
country1 = 'Netherlands'
country2 = 'Slovenia'

# Keep only the country-health-index predictors plus identifiers and target,
# restricted to the two countries under comparison.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, index_cols]
In [644]:
df_updated
Out[644]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.1 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.0 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.0 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.0 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.0 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.0 31400.840 102.619 2119843 0.536669

2099 rows × 8 columns

In [645]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (previous day / week / month)
# so each row carries its own recent history as predictor columns.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [646]:
# The first 1/7/30 rows of each country have no lagged history; treat those
# missing values as zero prior mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [647]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the fitted components encode
# the target itself — this leaks the answer into X and inflates every
# downstream CV/test score. Fit PCA on predictor columns only, and only on
# the training split; standardize before PCA so large-magnitude columns
# (e.g. population) do not dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[647]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [648]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): keeping the first 5 PCs does not recover the 5 original input
# variables — each component mixes every column passed to pca.fit (which here
# also includes the target and its lags).
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [649]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — relabelling them with the raw feature names is misleading, and
# the "feature importances" computed later actually describe PCs.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [650]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never fed to the model — X is
# built from principal_df below — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [651]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first PCs (mislabelled with raw feature names); y is the target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): PCA was fitted on ALL rows before this split, so test-set
# information (including the target) has already leaked into X; a
# chronological split would also suit time-series data better.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [652]:
# Fit scaling on the training set
# NOTE(review): standardizing *after* PCA (and for a scale-insensitive tree
# model) has little effect; scaling belongs before PCA if needed at all.
scaler = StandardScaler()
scaler.fit(X_train)
Out[652]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [653]:
# Apply scaling on the training set
# (scaler statistics come from X_train only — no scaling leakage here)
X_train_scaled = scaler.transform(X_train)
In [654]:
# Apply scaling on the test set
# (reuses the training-set mean/std, as it should)
X_test_scaled = scaler.transform(X_test)
In [655]:
# Define the XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for GridSearchCV (3*3*3*3*2*2 = 324 candidates)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [656]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# NOTE(review): the near-perfect CV score reflects the target leakage from
# fitting PCA on columns that include 'Mortality Rate' and its lags, not
# genuine predictive power.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9982367909239199
In [657]:
# GridSearchCV (refit=True by default) has already refit the best estimator
# on the full training set, so calling fit() again here repeats the same
# training run for nothing — use the refit estimator directly.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [658]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes KL divergence between the
# normalized inputs, not entropy — the label below is misleading and the
# metric is questionable for mortality rates; consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.008784049368404725
R2 Score: 0.9989278718325746
RMSE: 0.093723
Entropy Value: 0.0011132809856388346
In [659]:
# NOTE(review): these "importances" belong to principal components, not to
# the raw variables whose names were pasted onto them — the row labelled
# 'extreme_poverty' is really PC2, etc. To get attributable importances,
# train on the original (unrotated) features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[659]:
feature importance
1 extreme_poverty 0.751029
2 gdp_per_capita 0.152480
0 hospital_beds_per_thousand 0.088799
3 population_density 0.004278
4 population 0.003415
In [660]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[660]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [661]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Keep only the population-health-index predictors plus identifiers and
# target, restricted to the two countries under comparison.
index_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, index_cols]
In [662]:
df_updated
Out[662]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2102 rows × 10 columns

In [663]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (previous day / week / month)
# so each row carries its own recent history as predictor columns.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [664]:
# The first 1/7/30 rows of each country have no lagged history; treat those
# missing values as zero prior mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [665]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the fitted components encode
# the target itself — this leaks the answer into X and inflates every
# downstream CV/test score. Fit PCA on predictor columns only, and only on
# the training split; standardize before PCA so large-magnitude columns
# do not dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[665]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [666]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): keeping the first 7 PCs does not recover the 7 original input
# variables — each component mixes every column passed to pca.fit (which here
# also includes the target and its lags).
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [667]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — relabelling them with the raw feature names is misleading, and
# the "feature importances" computed later actually describe PCs.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [668]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never fed to the model — X is
# built from principal_df below — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [669]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first PCs (mislabelled with raw feature names); y is the target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): PCA was fitted on ALL rows before this split, so test-set
# information (including the target) has already leaked into X; a
# chronological split would also suit time-series data better.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [670]:
# Fit scaling on the training set
# NOTE(review): standardizing *after* PCA (and for a scale-insensitive tree
# model) has little effect; scaling belongs before PCA if needed at all.
scaler = StandardScaler()
scaler.fit(X_train)
Out[670]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [671]:
# Apply scaling on the training set
# (scaler statistics come from X_train only — no scaling leakage here)
X_train_scaled = scaler.transform(X_train)
In [672]:
# Apply scaling on the test set
# (reuses the training-set mean/std, as it should)
X_test_scaled = scaler.transform(X_test)
In [673]:
# Define the XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for GridSearchCV (3*3*3*3*2*2 = 324 candidates)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [674]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# NOTE(review): the near-perfect CV score reflects the target leakage from
# fitting PCA on columns that include 'Mortality Rate' and its lags, not
# genuine predictive power.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983978600106347
In [675]:
# GridSearchCV (refit=True by default) has already refit the best estimator
# on the full training set, so calling fit() again here repeats the same
# training run for nothing — use the refit estimator directly.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [676]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes KL divergence between the
# normalized inputs, not entropy — the label below is misleading and the
# metric is questionable for mortality rates; consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.016940840920362982
R2 Score: 0.99676348032928
RMSE: 0.130157
Entropy Value: 0.0007413159900353029
In [677]:
# NOTE(review): these "importances" belong to principal components, not to
# the raw variables whose names were pasted onto them — the row labelled
# 'diabetes_prevalence' is really PC2, etc. To get attributable importances,
# train on the original (unrotated) features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[677]:
feature importance
1 diabetes_prevalence 0.816403
6 median_age 0.105381
5 aged_65_older 0.045172
0 cardiovasc_death_rate 0.018481
2 female_smokers 0.009792
3 male_smokers 0.004547
4 life_expectancy 0.000225
In [678]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[678]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [679]:
country1 = 'Sweden'
country2 = 'Switzerland'

# Keep only the country-health-index predictors plus identifiers and target,
# restricted to the two countries under comparison.
index_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, index_cols]
In [680]:
df_updated
Out[680]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.03 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.50 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.50 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.50 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.50 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.50 46949.283 24.718 10549349 0.816005

2102 rows × 8 columns

In [681]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (previous day / week / month)
# so each row carries its own recent history as predictor columns.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag_days in (('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag_days)
In [682]:
# The first 1/7/30 rows of each country have no lagged history; treat those
# missing values as zero prior mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [683]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] still contains 'Mortality Rate' (the
# prediction target) and its lagged copies, so the fitted components encode
# the target itself — this leaks the answer into X and inflates every
# downstream CV/test score. Fit PCA on predictor columns only, and only on
# the training split; standardize before PCA so large-magnitude columns
# (e.g. population) do not dominate.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[683]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [684]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): keeping the first 5 PCs does not recover the 5 original input
# variables — each component mixes every column passed to pca.fit (which here
# also includes the target and its lags).
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [685]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, not the original
# features — relabelling them with the raw feature names is misleading, and
# the "feature importances" computed later actually describe PCs.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [686]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy location columns are never fed to the model — X is
# built from principal_df below — so this step only reshapes df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [687]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first PCs (mislabelled with raw feature names); y is the target.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): PCA was fitted on ALL rows before this split, so test-set
# information (including the target) has already leaked into X; a
# chronological split would also suit time-series data better.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [688]:
# Fit scaling on the training set
# NOTE(review): standardizing *after* PCA (and for a scale-insensitive tree
# model) has little effect; scaling belongs before PCA if needed at all.
scaler = StandardScaler()
scaler.fit(X_train)
Out[688]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [689]:
# Apply scaling on the training set
# (scaler statistics come from X_train only — no scaling leakage here)
X_train_scaled = scaler.transform(X_train)
In [690]:
# Apply scaling on the test set
# (reuses the training-set mean/std, as it should)
X_test_scaled = scaler.transform(X_test)
In [691]:
# Define the XGBoost regressor to be tuned
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space for GridSearchCV (3*3*3*3*2*2 = 324 candidates)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [692]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
# NOTE(review): the near-perfect CV score reflects the target leakage from
# fitting PCA on columns that include 'Mortality Rate' and its lags, not
# genuine predictive power.
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9980805378155827
In [693]:
# GridSearchCV (refit=True by default) has already refit the best estimator
# on the full training set, so calling fit() again here repeats the same
# training run for nothing — use the refit estimator directly.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [694]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes KL divergence between the
# normalized inputs, not entropy — the label below is misleading and the
# metric is questionable for mortality rates; consider MAE instead.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.016667254115702308
R2 Score: 0.9968157486363314
RMSE: 0.129102
Entropy Value: 0.001646613884826754
In [695]:
# NOTE(review): these "importances" belong to principal components, not to
# the raw variables whose names were pasted onto them — the row labelled
# 'extreme_poverty' is really PC2, etc. To get attributable importances,
# train on the original (unrotated) features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[695]:
feature importance
1 extreme_poverty 0.659676
2 gdp_per_capita 0.190431
0 hospital_beds_per_thousand 0.124054
3 population_density 0.022797
4 population 0.003043
In [696]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / pathlib.Path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[696]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [697]:
country1 = 'United Kingdom'
country2 = 'United States'

# Keep only the population-health-index predictors plus identifiers and
# target, restricted to the two countries under comparison.
index_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
country_mask = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[country_mask, index_cols]
In [698]:
df_updated
Out[698]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
12547 United Kingdom 2/1/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 50.000000
12548 United Kingdom 2/2/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 100.000000
12549 United Kingdom 2/3/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12550 United Kingdom 2/4/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 25.000000
12551 United Kingdom 2/5/2020 122.137 4.28 20.0 24.7 81.32 18.517 40.8 22.222222
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [699]:
# Convert the time series into a supervised-learning layout: each row keeps its
# current features plus the mortality rate observed 1 day, 7 days, and 30 days
# earlier. shift() is applied per location so one country's lag history never
# bleeds into another country's rows. This tabular form is what allows XGBoost
# to be used directly on the Our World in Data COVID-19 time series.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [700]:
# The first 1/7/30 rows of each location have no lag history after shift();
# treat that missing history as a mortality rate of 0 in one vectorized call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [701]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on raw (unscaled) columns, so components are dominated
# by the features with the largest numeric variance -- standardise first, or confirm
# this is intended.
# NOTE(review): fitting on the full dataset before the train/test split leaks
# test-set information into the transformation; fit on training rows only to avoid it.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[701]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [702]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Components are ordered by explained variance, so slicing the first n_components
# columns keeps the most informative directions of the score matrix.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [703]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA component scores (PC1..PC7), NOT the original
# features -- each PC is a linear mix of all inputs. Reusing the raw feature names
# makes the later feature-importance table look like it ranks the original variables
# when it actually ranks principal components. Rename to PC1..PC7 to avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [704]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never fed to the model (X is built from
# principal_df below); this step effectively only removes 'location' from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [705]:
# These names refer to principal-component scores (see the PCA cell), not raw features.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffle split over daily time-series rows lets the model
# train on dates adjacent to test dates -- consider a chronological split instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [706]:
# Fit scaling on the training set
# StandardScaler learns per-column mean/std from the training rows only; the same
# statistics are reused on the test set below, which avoids leakage at this stage.
scaler = StandardScaler()
scaler.fit(X_train)
Out[706]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [707]:
# Apply scaling on the training set (uses the mean/std fitted above)
X_train_scaled = scaler.transform(X_train)
In [708]:
# Apply scaling on the test set with the training-set statistics (no refit)
X_test_scaled = scaler.transform(X_test)
In [709]:
# Define XGBoost model (default objective: squared-error regression)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune.
# 3*3*3*3*2*2 = 324 candidate combinations; with the 10-fold CV below that is
# 3,240 model fits, so expect this search to be slow.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [710]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelises across all cores; scoring defaults to the regressor's R^2,
# which is what best_score_ reports below.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9573756200168303
In [711]:
# GridSearchCV refits the best hyperparameter combination on the whole training set
# by default (refit=True), so best_estimator_ is already trained -- calling .fit()
# again here would just repeat identical work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [712]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and a distribution-divergence measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)

# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# normalised vectors, not the Shannon entropy of the errors. It returns inf whenever
# a prediction is <= 0 where the true value is positive, so clip both vectors to a
# small positive floor before comparing.
eps = 1e-10
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  3.067522668688002
R2 Score: 0.8735128665771991
RMSE: 1.751434
Entropy Value: 0.00997828364305316
In [713]:
# NOTE(review): X holds PCA scores, so these importances rank principal components,
# not the original variables -- the feature names shown are misleading labels
# carried over from principal_df.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[713]:
feature importance
0 cardiovasc_death_rate 0.515435
1 diabetes_prevalence 0.265632
2 female_smokers 0.128937
5 aged_65_older 0.040833
6 median_age 0.026848
4 life_expectancy 0.013474
3 male_smokers 0.008841
In [714]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[714]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [715]:
country1 = 'United Kingdom'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the filtered slice an independent DataFrame, so the lagged-column
# assignments in later cells do not raise SettingWithCopyWarning or write to a view.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [716]:
df_updated
Out[716]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
12547 United Kingdom 2/1/2020 2.54 0.2 39753.244 272.898 67508936 50.000000
12548 United Kingdom 2/2/2020 2.54 0.2 39753.244 272.898 67508936 100.000000
12549 United Kingdom 2/3/2020 2.54 0.2 39753.244 272.898 67508936 25.000000
12550 United Kingdom 2/4/2020 2.54 0.2 39753.244 272.898 67508936 25.000000
12551 United Kingdom 2/5/2020 2.54 0.2 39753.244 272.898 67508936 22.222222
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 1.2 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 1.2 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 1.2 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 1.2 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 1.2 54225.446 35.608 338289856 1.084791

2136 rows × 8 columns

In [717]:
# Convert the time series into a supervised-learning layout: each row keeps its
# current features plus the mortality rate observed 1 day, 7 days, and 30 days
# earlier. shift() is applied per location so one country's lag history never
# bleeds into another country's rows. This tabular form is what allows XGBoost
# to be used directly on the Our World in Data COVID-19 time series.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [718]:
# The first 1/7/30 rows of each location have no lag history after shift();
# treat that missing history as a mortality rate of 0 in one vectorized call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [719]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[719]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [720]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [721]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA component scores (PC1..PC5), NOT the original
# features -- each PC is a linear mix of all inputs. Reusing the raw feature names
# makes the later feature-importance table look like it ranks the original variables
# when it actually ranks principal components. Rename to PC1..PC5 to avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [722]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [723]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [724]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[724]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [725]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [726]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [727]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [728]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9551791807835526
In [729]:
# GridSearchCV refits the best hyperparameter combination on the whole training set
# by default (refit=True), so best_estimator_ is already trained -- calling .fit()
# again here would just repeat identical work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [730]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and a distribution-divergence measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)

# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# normalised vectors, not the Shannon entropy of the errors. The "inf" previously
# printed here happened because a prediction was <= 0 where the true value was
# positive; clipping both vectors to a small positive floor prevents that.
eps = 1e-10
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  1.4775238366856351
R2 Score: 0.9390753468347937
RMSE: 1.215534
Entropy Value: inf
In [731]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[731]:
feature importance
1 extreme_poverty 0.360283
4 population 0.281991
0 hospital_beds_per_thousand 0.175041
2 gdp_per_capita 0.135475
3 population_density 0.047210
In [732]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[732]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [733]:
country1 = 'Cyprus'
country2 = 'Czechia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the filtered slice an independent DataFrame, so the lagged-column
# assignments in later cells do not raise SettingWithCopyWarning or write to a view.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [734]:
df_updated
Out[734]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919575

2061 rows × 10 columns

In [735]:
# Convert the time series into a supervised-learning layout: each row keeps its
# current features plus the mortality rate observed 1 day, 7 days, and 30 days
# earlier. shift() is applied per location so one country's lag history never
# bleeds into another country's rows. This tabular form is what allows XGBoost
# to be used directly on the Our World in Data COVID-19 time series.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [736]:
# The first 1/7/30 rows of each location have no lag history after shift();
# treat that missing history as a mortality rate of 0 in one vectorized call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [737]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[737]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [738]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [739]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA component scores (PC1..PC7), NOT the original
# features -- each PC is a linear mix of all inputs. Reusing the raw feature names
# makes the later feature-importance table look like it ranks the original variables
# when it actually ranks principal components. Rename to PC1..PC7 to avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [740]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [741]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [742]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[742]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [743]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [744]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [745]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [746]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968265232047949
In [747]:
# GridSearchCV refits the best hyperparameter combination on the whole training set
# by default (refit=True), so best_estimator_ is already trained -- calling .fit()
# again here would just repeat identical work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [748]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and a distribution-divergence measure.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)

# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence between the two
# normalised vectors, not the Shannon entropy of the errors. It returns inf whenever
# a prediction is <= 0 where the true value is positive, so clip both vectors to a
# small positive floor before comparing.
eps = 1e-10
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.000978608688705832
R2 Score: 0.9982973886897886
RMSE: 0.031283
Entropy Value: 0.0005033828926016659
In [749]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[749]:
feature importance
1 diabetes_prevalence 0.658933
0 cardiovasc_death_rate 0.184826
5 aged_65_older 0.094490
6 median_age 0.029701
2 female_smokers 0.024293
3 male_smokers 0.006019
4 life_expectancy 0.001737
In [750]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[750]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [751]:
country1 = 'Cyprus'
country2 = 'Czechia'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the filtered slice an independent DataFrame, so the lagged-column
# assignments in later cells do not raise SettingWithCopyWarning or write to a view.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [752]:
df_updated
Out[752]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.15 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.15 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.15 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.15 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.15 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.63 0.00 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.63 0.00 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.63 0.00 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.63 0.00 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.63 0.00 32605.906 137.176 10493990 0.919575

2061 rows × 8 columns

In [753]:
# Convert the time series into a supervised-learning layout: each row keeps its
# current features plus the mortality rate observed 1 day, 7 days, and 30 days
# earlier. shift() is applied per location so one country's lag history never
# bleeds into another country's rows. This tabular form is what allows XGBoost
# to be used directly on the Our World in Data COVID-19 time series.
for lag_col, lag in [('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby('location')['Mortality Rate'].shift(lag)
In [754]:
# The first 1/7/30 rows of each location have no lag history after shift();
# treat that missing history as a mortality rate of 0 in one vectorized call.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [755]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[755]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [756]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [757]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are PCA component scores (PC1..PC5), NOT the original
# features -- each PC is a linear mix of all inputs. Reusing the raw feature names
# makes the later feature-importance table look like it ranks the original variables
# when it actually ranks principal components. Rename to PC1..PC5 to avoid misreading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [758]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [759]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [760]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[760]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [761]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [762]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [763]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [764]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): folds are drawn from shuffled time-series rows, so CV scores
# share temporal information across folds; default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9943317261946557
In [765]:
# GridSearchCV (refit=True by default) has already refit best_estimator_ on the
# full training set after the search, so an explicit second fit here retrains
# the identical model on the identical data — drop the redundant call.
best_model = grid_search.best_estimator_

# Making predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [766]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its two inputs into probability
# distributions and computes KL divergence — applying it to raw mortality
# values (which include zeros) is not a standard regression metric; confirm
# this is the intended measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0027632388218584106
R2 Score: 0.9951924382797649
RMSE: 0.052567
Entropy Value: 0.001055748763209947
In [767]:
# Rank model inputs by learned importance. Caveat: the inputs are PCA component
# scores that merely carry the names in selected_cols, so importances describe
# components, not the raw features themselves.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[767]:
feature importance
1 extreme_poverty 0.462078
0 hospital_beds_per_thousand 0.393009
2 gdp_per_capita 0.063780
4 population 0.049975
3 population_density 0.031158
In [768]:
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — a configurable DATA_DIR
# would let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[768]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [769]:
country1 = 'Estonia'
country2 = 'France'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the filtered frame an independent object, so the lagged-feature
# assignments in later cells write to a real DataFrame instead of a slice
# (avoids pandas' SettingWithCopyWarning / silent chained-assignment behavior).
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [770]:
# Preview the filtered two-country frame
df_updated
Out[770]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411892

2132 rows × 10 columns

In [771]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1-day, 7-day, 30-day) so the
# time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [772]:
# The earliest rows of each country have no history, leaving NaN in the lag
# columns — treat missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [773]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged columns,
# so the prediction target (and its lags) leak into the components later used
# as model inputs — this likely inflates the reported R^2. Consider fitting
# PCA on the predictor columns only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[773]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [774]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): keeping the first n columns of a full PCA transform is
# equivalent to PCA(n_components=7); the transform input still includes the
# target columns (see the fit cell) — confirm intended.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [775]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# features — reusing raw feature names makes the later importance table
# misattribute importance to raw features. Consider PC1..PC7 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [776]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never used downstream (X is
# built from principal_df) — confirm whether this cell can be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [777]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # PCA component scores (carry raw feature names — see earlier note)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of time-series rows mixes future
# observations into training — a chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [778]:
# Fit scaling on the training set (statistics learned from training rows only)
scaler = StandardScaler()
scaler.fit(X_train)
Out[778]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [779]:
# Apply scaling on the training set using the fitted statistics
X_train_scaled = scaler.transform(X_train)
In [780]:
# Apply scaling on the test set with the training-set statistics (never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [781]:
# Define the XGBoost regressor with default settings; tuning is delegated to grid search.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search (3*3*3*3*2*2 = 324 candidate configurations)
params = {
    'max_depth':        [3, 4, 5],
    'learning_rate':    [0.1, 0.01, 0.001],
    'n_estimators':     [50, 100, 150],
    'gamma':            [0, 0.1, 0.2],
    'subsample':        [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [782]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): folds come from shuffled time-series rows, so CV scores share
# temporal information across folds; default scoring is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9969900485757501
In [783]:
# GridSearchCV (refit=True by default) already refit best_estimator_ on the
# full training set — an explicit second fit retrains the identical model on
# the identical data, so it is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [784]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and computes KL divergence — not a standard regression metric
# for raw mortality values (which include zeros); confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06340265752044813
R2 Score: 0.9933988909094517
RMSE: 0.251799
Entropy Value: 0.0037487769082763605
In [785]:
# Rank model inputs by learned importance. Caveat: the inputs are PCA component
# scores carrying the names in selected_cols, so importances describe
# components rather than the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[785]:
feature importance
1 diabetes_prevalence 0.718145
0 cardiovasc_death_rate 0.215909
5 aged_65_older 0.026000
6 median_age 0.015482
2 female_smokers 0.013423
3 male_smokers 0.010069
4 life_expectancy 0.000971
In [786]:
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — a configurable DATA_DIR
# would let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[786]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [787]:
country1 = 'Estonia'
country2 = 'France'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the filtered frame an independent object, so the lagged-feature
# assignments in later cells don't trigger pandas' SettingWithCopyWarning /
# silent chained-assignment behavior.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [788]:
# Preview the filtered two-country frame
df_updated
Out[788]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
6250 Estonia 1/18/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
6251 Estonia 2/5/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
6252 Estonia 2/6/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
6253 Estonia 2/7/2020 4.69 0.50 29481.252 31.033 1326064 0.000000
... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.02 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.02 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.02 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.02 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.02 38605.671 122.578 67813000 0.411892

2132 rows × 8 columns

In [789]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1-day, 7-day, 30-day) so the
# time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [790]:
# The earliest rows of each country have no history, leaving NaN in the lag
# columns — treat missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [791]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged columns,
# so the prediction target leaks into the components used as model inputs —
# likely inflating the reported R^2. Consider fitting PCA on predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[791]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [792]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): keeping the first n columns of the full transform is equivalent
# to PCA(n_components=5); the transform input still includes the target columns
# (see the fit cell) — confirm intended.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [793]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# features — reusing raw feature names makes the later importance table
# misattribute importance to raw features. Consider PC1..PC5 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [794]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never used downstream (X is
# built from principal_df) — confirm whether this cell can be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [795]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values  # PCA component scores (carry raw feature names — see earlier note)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of time-series rows mixes future
# observations into training — a chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [796]:
# Fit scaling on the training set (statistics learned from training rows only)
scaler = StandardScaler()
scaler.fit(X_train)
Out[796]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [797]:
# Apply scaling on the training set using the fitted statistics
X_train_scaled = scaler.transform(X_train)
In [798]:
# Apply scaling on the test set with the training-set statistics (never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [799]:
# Define the XGBoost regressor with default settings; tuning is delegated to grid search.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search (3*3*3*3*2*2 = 324 candidate configurations)
params = {
    'max_depth':        [3, 4, 5],
    'learning_rate':    [0.1, 0.01, 0.001],
    'n_estimators':     [50, 100, 150],
    'gamma':            [0, 0.1, 0.2],
    'subsample':        [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [800]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): folds come from shuffled time-series rows, so CV scores share
# temporal information across folds; default scoring is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9952478754571652
In [801]:
# GridSearchCV (refit=True by default) already refit best_estimator_ on the
# full training set — an explicit second fit retrains the identical model on
# the identical data, so it is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [802]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and computes KL divergence — not a standard regression metric
# for raw mortality values (which include zeros); confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.054477133587009265
R2 Score: 0.9943281635847481
RMSE: 0.233403
Entropy Value: 0.0035443412739072674
In [803]:
# Rank model inputs by learned importance. Caveat: the inputs are PCA component
# scores carrying the names in selected_cols, so importances describe
# components rather than the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[803]:
feature importance
1 extreme_poverty 0.685218
0 hospital_beds_per_thousand 0.153925
2 gdp_per_capita 0.080001
4 population 0.063162
3 population_density 0.017695
In [804]:
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — a configurable DATA_DIR
# would let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[804]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [805]:
country1 = 'Italy'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the filtered frame an independent object, so the lagged-feature
# assignments in later cells don't trigger pandas' SettingWithCopyWarning /
# silent chained-assignment behavior.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [806]:
# Preview the filtered two-country frame
df_updated
Out[806]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
19873 Latvia 1/6/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
19874 Latvia 1/18/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
19875 Latvia 2/12/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
19876 Latvia 2/29/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
19877 Latvia 3/1/2020 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2102 rows × 10 columns

In [807]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1-day, 7-day, 30-day) so the
# time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [808]:
# The earliest rows of each country have no history, leaving NaN in the lag
# columns — treat missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [809]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and its lagged columns,
# so the prediction target leaks into the components used as model inputs —
# likely inflating the reported R^2. Consider fitting PCA on predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[809]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [810]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): keeping the first n columns of the full transform is equivalent
# to PCA(n_components=7); the transform input still includes the target columns
# (see the fit cell) — confirm intended.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [811]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component scores, not the original
# features — reusing raw feature names makes the later importance table
# misattribute importance to raw features. Consider PC1..PC7 instead.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [812]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the location_* dummy columns are never used downstream (X is
# built from principal_df) — confirm whether this cell can be removed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [813]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values  # PCA component scores (carry raw feature names — see earlier note)
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of time-series rows mixes future
# observations into training — a chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [814]:
# Fit scaling on the training set (statistics learned from training rows only)
scaler = StandardScaler()
scaler.fit(X_train)
Out[814]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [815]:
# Apply scaling on the training set using the fitted statistics
X_train_scaled = scaler.transform(X_train)
In [816]:
# Apply scaling on the test set with the training-set statistics (never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [817]:
# Define the XGBoost regressor with default settings; tuning is delegated to grid search.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to search (3*3*3*3*2*2 = 324 candidate configurations)
params = {
    'max_depth':        [3, 4, 5],
    'learning_rate':    [0.1, 0.01, 0.001],
    'n_estimators':     [50, 100, 150],
    'gamma':            [0, 0.1, 0.2],
    'subsample':        [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}
In [818]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): folds come from shuffled time-series rows, so CV scores share
# temporal information across folds; default scoring is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9990157458739676
In [819]:
# GridSearchCV (refit=True by default) already refit best_estimator_ on the
# full training set — an explicit second fit retrains the identical model on
# the identical data, so it is dropped.
best_model = grid_search.best_estimator_

# Making predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [820]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes its inputs into probability
# distributions and computes KL divergence — not a standard regression metric
# for raw mortality values (which include zeros); confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.016257964094386432
R2 Score: 0.9985806056081434
RMSE: 0.127507
Entropy Value: 0.0005473259353336738
In [821]:
# Rank model inputs by learned importance. Caveat: the inputs are PCA component
# scores carrying the names in selected_cols, so importances describe
# components rather than the raw features.
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[821]:
feature importance
0 cardiovasc_death_rate 0.338637
5 aged_65_older 0.301029
1 diabetes_prevalence 0.300169
3 male_smokers 0.022931
6 median_age 0.018653
2 female_smokers 0.018266
4 life_expectancy 0.000316
In [822]:
# Importing the dataframe of all 26 countries
# NOTE(review): absolute, user-specific Windows path — a configurable DATA_DIR
# would let the notebook run on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[822]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [823]:
country1 = 'Italy'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the filtered frame an independent object, so the lagged-feature
# assignments in later cells don't trigger pandas' SettingWithCopyWarning /
# silent chained-assignment behavior.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [824]:
# Preview the filtered two-country frame
df_updated
Out[824]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
19873 Latvia 1/6/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
19874 Latvia 1/18/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
19875 Latvia 2/12/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
19876 Latvia 2/29/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
19877 Latvia 3/1/2020 5.57 0.7 25063.846 31.212 1850654 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 2.0 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 2.0 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 2.0 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 2.0 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 2.0 35220.084 205.859 59037472 0.735109

2102 rows × 8 columns

In [825]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build per-country lagged mortality features (1-day, 7-day, 30-day) so the
# time series becomes a supervised-learning table.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for lag_col, lag in (('prev_day_mortality', 1),
                     ('prev_week_mortality', 7),
                     ('prev_month_mortality', 30)):
    df_updated[lag_col] = mortality_by_country.shift(lag)
In [826]:
# The earliest rows of each country have no history, leaving NaN in the lag
# columns — treat missing history as zero mortality.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [827]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split below, so
# test-set statistics leak into the learned components — confirm intended.
# NOTE(review): iloc[:, 2:] skips only the first two columns, so 'Mortality Rate'
# (the target) and the lag features are included in the PCA inputs — verify.
# NOTE(review): inputs are not standardized before PCA, so large-scale columns
# (e.g. population) will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[827]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [828]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA above also saw 'Mortality Rate' and the lag columns, so
# these first 5 components are not a one-to-one mapping of the 5 inputs.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [829]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC5), not the raw
# variables; reusing raw feature names here makes the later feature-importance
# table look like it ranks the original variables when it actually ranks PCs.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [830]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never added to X below; only
# 'Mortality Rate' is read from this frame afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [831]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first 5 principal components (labelled with raw feature names).
X = principal_df[selected_cols].values
# Alignment between X and y relies on both frames preserving the same row order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on a time series mixes future and past
# rows between train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [832]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/variance from the training rows only,
# so no test-set information enters the scaling step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[832]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [833]:
# Apply scaling on the training set (uses the mean/variance fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [834]:
# Apply scaling on the test set (reuses the train-set mean/variance — no refit)
X_test_scaled = scaler.transform(X_test)
In [835]:
# Define the XGBoost regressor to be tuned by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 combinations)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [836]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes across all cores; scoring defaults to the
# estimator's .score, which is R^2 for regressors.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979179222463455
In [837]:
# Select the model fitted with the best hyperparameters.
# GridSearchCV refits the best estimator on the full training set when
# refit=True (the default), so best_estimator_ is already trained; the
# previous explicit best_model.fit(...) call was redundant and is removed.
best_model = grid_search.best_estimator_

# Making predictions on the scaled held-out test set
y_pred = best_model.predict(X_test_scaled)
In [838]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence and
# normalizes both inputs into probability distributions; mortality rates are
# not distributions, so this value is hard to interpret — confirm the metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.027210653455331336
R2 Score: 0.9976243858893387
RMSE: 0.164957
Entropy Value: 0.0013566776347740147
In [839]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[839]:
feature importance
1 extreme_poverty 0.481945
0 hospital_beds_per_thousand 0.321784
3 population_density 0.105203
2 gdp_per_capita 0.083330
4 population 0.007738
In [840]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory (e.g. pathlib.Path constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[840]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [841]:
country1 = 'Portugal'
country2 = 'Slovakia'

# Keep only the population-health predictors plus identifiers and the target,
# restricted to the two countries under comparison.
population_health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                          'female_smokers', 'male_smokers', 'life_expectancy',
                          'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), population_health_cols]
In [842]:
df_updated
Out[842]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
12542 Slovakia 12/25/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783216
12543 Slovakia 12/26/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783313
12544 Slovakia 12/27/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783363
12545 Slovakia 12/28/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783459
12546 Slovakia 12/29/2022 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.783522

2063 rows × 10 columns

In [843]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: 1 day, 7 days (week) and 30 days (month) back.
# Grouping by location keeps each shift inside a single country's time series.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column, lag in [('prev_day_mortality', 1),
                    ('prev_week_mortality', 7),
                    ('prev_month_mortality', 30)]:
    df_updated[column] = mortality_by_country.shift(lag)
In [844]:
# Replace NaN values in the lag columns with 0: the first rows of each country
# have no earlier observation, so their shifted values come back as NaN.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [845]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split below, so
# test-set statistics leak into the learned components — confirm intended.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so 'Mortality Rate'
# (the target) and the lag features are included in the PCA inputs — verify.
# NOTE(review): inputs are not standardized before PCA, so large-scale columns
# will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[845]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [846]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA above also saw 'Mortality Rate' and the lag columns, so
# these first 7 components are not a one-to-one mapping of the 7 inputs.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [847]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC7), not the raw
# variables; reusing raw feature names here makes the later feature-importance
# table look like it ranks the original variables when it actually ranks PCs.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [848]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never added to X below; only
# 'Mortality Rate' is read from this frame afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [849]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first 7 principal components (labelled with raw feature names).
X = principal_df[selected_cols].values
# Alignment between X and y relies on both frames preserving the same row order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on a time series mixes future and past
# rows between train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [850]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/variance from the training rows only,
# so no test-set information enters the scaling step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[850]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [851]:
# Apply scaling on the training set (uses the mean/variance fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [852]:
# Apply scaling on the test set (reuses the train-set mean/variance — no refit)
X_test_scaled = scaler.transform(X_test)
In [853]:
# Define the XGBoost regressor to be tuned by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 combinations)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [854]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes across all cores; scoring defaults to the
# estimator's .score, which is R^2 for regressors.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9986452711349703
In [855]:
# Select the model fitted with the best hyperparameters.
# GridSearchCV refits the best estimator on the full training set when
# refit=True (the default), so best_estimator_ is already trained; the
# previous explicit best_model.fit(...) call was redundant and is removed.
best_model = grid_search.best_estimator_

# Making predictions on the scaled held-out test set
y_pred = best_model.predict(X_test_scaled)
In [856]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence and
# normalizes both inputs into probability distributions; mortality rates are
# not distributions, so this value is hard to interpret — confirm the metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0011302969685894953
R2 Score: 0.9984342668984687
RMSE: 0.033620
Entropy Value: 0.00025289287267278896
In [857]:
# NOTE(review): feature_importances_ ranks the principal components; the raw
# variable names in selected_cols are labels only, not the original features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[857]:
feature importance
1 diabetes_prevalence 0.666466
0 cardiovasc_death_rate 0.227467
6 median_age 0.077055
5 aged_65_older 0.014932
2 female_smokers 0.011849
3 male_smokers 0.001740
4 life_expectancy 0.000491
In [858]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory (e.g. pathlib.Path constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[858]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [859]:
country1 = 'Portugal'
country2 = 'Slovakia'

# Keep only the country-health predictors plus identifiers and the target,
# restricted to the two countries under comparison.
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty',
                       'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_health_cols]
In [860]:
df_updated
Out[860]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.5 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ...
12542 Slovakia 12/25/2022 5.82 0.7 30155.152 113.128 5643455 0.783216
12543 Slovakia 12/26/2022 5.82 0.7 30155.152 113.128 5643455 0.783313
12544 Slovakia 12/27/2022 5.82 0.7 30155.152 113.128 5643455 0.783363
12545 Slovakia 12/28/2022 5.82 0.7 30155.152 113.128 5643455 0.783459
12546 Slovakia 12/29/2022 5.82 0.7 30155.152 113.128 5643455 0.783522

2063 rows × 8 columns

In [861]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: 1 day, 7 days (week) and 30 days (month) back.
# Grouping by location keeps each shift inside a single country's time series.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column, lag in [('prev_day_mortality', 1),
                    ('prev_week_mortality', 7),
                    ('prev_month_mortality', 30)]:
    df_updated[column] = mortality_by_country.shift(lag)
In [862]:
# Replace NaN values in the lag columns with 0: the first rows of each country
# have no earlier observation, so their shifted values come back as NaN.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [863]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split below, so
# test-set statistics leak into the learned components — confirm intended.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so 'Mortality Rate'
# (the target) and the lag features are included in the PCA inputs — verify.
# NOTE(review): inputs are not standardized before PCA, so large-scale columns
# (e.g. population) will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[863]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [864]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA above also saw 'Mortality Rate' and the lag columns, so
# these first 5 components are not a one-to-one mapping of the 5 inputs.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [865]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC5), not the raw
# variables; reusing raw feature names here makes the later feature-importance
# table look like it ranks the original variables when it actually ranks PCs.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [866]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never added to X below; only
# 'Mortality Rate' is read from this frame afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [867]:
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
# X holds the first 5 principal components (labelled with raw feature names).
X = principal_df[selected_cols].values
# Alignment between X and y relies on both frames preserving the same row order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on a time series mixes future and past
# rows between train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [868]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/variance from the training rows only,
# so no test-set information enters the scaling step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[868]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [869]:
# Apply scaling on the training set (uses the mean/variance fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [870]:
# Apply scaling on the test set (reuses the train-set mean/variance — no refit)
X_test_scaled = scaler.transform(X_test)
In [871]:
# Define the XGBoost regressor to be tuned by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 combinations)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [872]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes across all cores; scoring defaults to the
# estimator's .score, which is R^2 for regressors.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977287916661088
In [873]:
# Select the model fitted with the best hyperparameters.
# GridSearchCV refits the best estimator on the full training set when
# refit=True (the default), so best_estimator_ is already trained; the
# previous explicit best_model.fit(...) call was redundant and is removed.
best_model = grid_search.best_estimator_

# Making predictions on the scaled held-out test set
y_pred = best_model.predict(X_test_scaled)
In [874]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence and
# normalizes both inputs into probability distributions; mortality rates are
# not distributions, so this value is hard to interpret — confirm the metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001578803228233158
R2 Score: 0.997812977877545
RMSE: 0.039734
Entropy Value: 0.00035458005898024133
In [875]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[875]:
feature importance
1 extreme_poverty 0.790723
0 hospital_beds_per_thousand 0.136952
2 gdp_per_capita 0.044642
3 population_density 0.026271
4 population 0.001412
In [876]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory (e.g. pathlib.Path constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[876]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [877]:
country1 = 'Spain'
country2 = 'Bulgaria'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [878]:
df_updated
Out[878]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25133 Spain 12/26/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25134 Spain 12/27/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25135 Spain 12/28/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148
25136 Spain 12/29/2022 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.855148

2090 rows × 10 columns

In [879]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Lagged mortality features: 1 day, 7 days (week) and 30 days (month) back.
# Grouping by location keeps each shift inside a single country's time series.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for column, lag in [('prev_day_mortality', 1),
                    ('prev_week_mortality', 7),
                    ('prev_month_mortality', 30)]:
    df_updated[column] = mortality_by_country.shift(lag)
In [880]:
# Replace NaN values in the lag columns with 0: the first rows of each country
# have no earlier observation, so their shifted values come back as NaN.
lag_columns = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_columns] = df_updated[lag_columns].fillna(0)
In [881]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on ALL rows before the train/test split below, so
# test-set statistics leak into the learned components — confirm intended.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so 'Mortality Rate'
# (the target) and the lag features are included in the PCA inputs — verify.
# NOTE(review): inputs are not standardized before PCA, so large-scale columns
# will dominate the components.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[881]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [882]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA above also saw 'Mortality Rate' and the lag columns, so
# these first 7 components are not a one-to-one mapping of the 7 inputs.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [883]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (PC1..PC7), not the raw
# variables; reusing raw feature names here makes the later feature-importance
# table look like it ranks the original variables when it actually ranks PCs.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [884]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never added to X below; only
# 'Mortality Rate' is read from this frame afterwards.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [885]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first 7 principal components (labelled with raw feature names).
X = principal_df[selected_cols].values
# Alignment between X and y relies on both frames preserving the same row order.
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on a time series mixes future and past
# rows between train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [886]:
# Fit scaling on the training set
# StandardScaler learns per-feature mean/variance from the training rows only,
# so no test-set information enters the scaling step.
scaler = StandardScaler()
scaler.fit(X_train)
Out[886]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [887]:
# Apply scaling on the training set (uses the mean/variance fitted on X_train above)
X_train_scaled = scaler.transform(X_train)
In [888]:
# Apply scaling on the test set (reuses the train-set mean/variance — no refit)
X_test_scaled = scaler.transform(X_test)
In [889]:
# Define the XGBoost regressor to be tuned by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space (3*3*3*3*2*2 = 324 combinations)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [890]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 parallelizes across all cores; scoring defaults to the
# estimator's .score, which is R^2 for regressors.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9854266642816312
In [891]:
# Select the model fitted with the best hyperparameters.
# GridSearchCV refits the best estimator on the full training set when
# refit=True (the default), so best_estimator_ is already trained; the
# previous explicit best_model.fit(...) call was redundant and is removed.
best_model = grid_search.best_estimator_

# Making predictions on the scaled held-out test set
y_pred = best_model.predict(X_test_scaled)
In [892]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence and
# normalizes both inputs into probability distributions; mortality rates are
# not distributions, so this value is hard to interpret — confirm the metric.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005444360169759133
R2 Score: 0.9988853299880451
RMSE: 0.073786
Entropy Value: 0.00033004558394914924
In [893]:
# NOTE(review): feature_importances_ ranks the principal components; the raw
# variable names in selected_cols are labels only, not the original features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[893]:
feature importance
1 diabetes_prevalence 0.606117
5 aged_65_older 0.265091
0 cardiovasc_death_rate 0.074869
2 female_smokers 0.023148
6 median_age 0.022173
4 life_expectancy 0.004642
3 male_smokers 0.003961
In [894]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable data
# directory (e.g. pathlib.Path constant) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[894]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [895]:
country1 = 'Spain'
country2 = 'Bulgaria'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() gives an independent frame so later cells can add the lag columns
# without triggering pandas' SettingWithCopyWarning on a filtered slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [896]:
# Quick visual check of the filtered two-country frame.
df_updated
Out[896]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 1.5 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 1.5 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 1.5 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 1.5 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 1.5 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ...
25132 Spain 12/25/2022 2.970 1.0 34272.360 93.105 47558632 0.855148
25133 Spain 12/26/2022 2.970 1.0 34272.360 93.105 47558632 0.855148
25134 Spain 12/27/2022 2.970 1.0 34272.360 93.105 47558632 0.855148
25135 Spain 12/28/2022 2.970 1.0 34272.360 93.105 47558632 0.855148
25136 Spain 12/29/2022 2.970 1.0 34272.360 93.105 47558632 0.855148

2090 rows × 8 columns

In [897]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): df_updated is a filtered slice of the imported frame; these
# column assignments may raise SettingWithCopyWarning — consider .copy() at
# the filtering step.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [898]:
# Zero-fill the NaNs that shift() introduced at the start of each country's series.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [899]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on every column from index 2 onward, which
# includes 'Mortality Rate' and its lag features — the prediction target leaks
# into the components. It is also fitted on the full dataset before the
# train/test split, and on unscaled data (large-magnitude columns such as
# population will dominate the components). TODO confirm these are intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[899]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [900]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Keep only the first 5 components; each component is a linear combination of
# ALL columns passed to pca.fit, not one of the original input variables.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [901]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the raw feature names, but the values
# are principal components — the downstream "feature importance" table
# therefore ranks components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [902]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream (X is built from
# principal_df) — TODO confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [903]:
# Design matrix: the five principal components; target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [904]:
# Fit scaling on the training set
# Fitting on the training split only prevents test-set information leaking
# into the scaler's mean/std estimates.
scaler = StandardScaler()
scaler.fit(X_train)
Out[904]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [905]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # returns a new array; X_train is unchanged
In [906]:
# Apply scaling on the test set (using the statistics fitted on the training set)
X_test_scaled = scaler.transform(X_test)
In [907]:
# Define XGBoost model (hyperparameters are selected by the grid search below)
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid: 3*3*3*3*2*2 = 324 combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [908]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidate settings x 10 folds = 3240 model fits; n_jobs=-1 parallelises
# across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9839372367216079
In [909]:
# GridSearchCV already re-fits the best estimator on the full training data
# when refit=True (the default), so best_estimator_ is ready to use and an
# explicit fit() call here would be redundant.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [910]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the KL divergence after
# normalising both arguments into probability distributions — it is not a
# standard regression metric, and zeros in y_test make it hard to interpret
# here. TODO confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012444501810780155
R2 Score: 0.997452131646388
RMSE: 0.111555
Entropy Value: 0.0007493391098116566
In [911]:
# Rank the model inputs (PCA components labelled with the selected column
# names) by XGBoost importance score, largest first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[911]:
feature importance
1 extreme_poverty 0.691825
0 hospital_beds_per_thousand 0.200318
2 gdp_per_capita 0.056118
4 population 0.035448
3 population_density 0.016291
In [912]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable data
# directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[912]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [90]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() gives an independent frame so later cells can add the lag columns
# without triggering pandas' SettingWithCopyWarning on a filtered slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [91]:
# Quick visual check of the filtered two-country frame.
df_updated
Out[91]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
15721 Serbia 2/26/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15722 Serbia 2/27/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15723 Serbia 2/28/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15724 Serbia 2/29/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
15725 Serbia 3/1/2020 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403

2076 rows × 10 columns

In [92]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): df_updated is a filtered slice of the imported frame; these
# column assignments may raise SettingWithCopyWarning — consider .copy() at
# the filtering step.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [93]:
# Zero-fill the NaNs that shift() introduced at the start of each country's series.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [94]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on every column from index 2 onward, including
# 'Mortality Rate' and its lag features (target leakage), on the full dataset
# before the train/test split, and on unscaled data. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[94]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [95]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Keep only the first 7 components; each component is a linear combination of
# ALL columns passed to pca.fit, not one of the original input variables.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [96]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the raw feature names, but the values
# are principal components — the downstream "feature importance" table
# therefore ranks components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [97]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream (X is built from
# principal_df) — TODO confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [98]:
# Design matrix: the seven principal components; target: the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [99]:
# Fit scaling on the training set
# Fitting on the training split only prevents test-set information leaking
# into the scaler's mean/std estimates.
scaler = StandardScaler()
scaler.fit(X_train)
Out[99]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [100]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # returns a new array; X_train is unchanged
In [101]:
# Apply scaling on the test set (using the statistics fitted on the training set)
X_test_scaled = scaler.transform(X_test)
In [102]:
# Define XGBoost model (hyperparameters are selected by the grid search below)
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid: 3*3*3*3*2*2 = 324 combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [103]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidate settings x 10 folds = 3240 model fits; n_jobs=-1 parallelises
# across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9986912975875828
In [104]:
# GridSearchCV already re-fits the best estimator on the full training data
# when refit=True (the default), so best_estimator_ is ready to use and an
# explicit fit() call here would be redundant.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [105]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the KL divergence after
# normalising both arguments into probability distributions — it is not a
# standard regression metric. TODO confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015132058627513301
R2 Score: 0.9991189988145764
RMSE: 0.038900
Entropy Value: 0.00034236383209695746
In [106]:
# Rank the model inputs (PCA components labelled with the selected column
# names) by XGBoost importance score, largest first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[106]:
feature importance
0 cardiovasc_death_rate 0.494640
5 aged_65_older 0.227501
6 median_age 0.139444
1 diabetes_prevalence 0.132259
2 female_smokers 0.005578
3 male_smokers 0.000382
4 life_expectancy 0.000195
In [107]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable data
# directory so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[107]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [108]:
country1 = 'Romania'
country2 = 'Serbia'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() gives an independent frame so later cells can add the lag columns
# without triggering pandas' SettingWithCopyWarning on a filtered slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [109]:
# Quick visual check of the filtered two-country frame.
df_updated
Out[109]:
location date hospital_beds_per_thousand extreme_poverty gdp_per_capita population_density population Mortality Rate
15721 Serbia 2/26/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
15722 Serbia 2/27/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
15723 Serbia 2/28/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
15724 Serbia 2/29/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
15725 Serbia 3/1/2020 5.609 0.05 14048.881 80.291 6871547 0.000000
... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 5.70 23313.199 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 5.70 23313.199 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 5.70 23313.199 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 5.70 23313.199 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 5.70 23313.199 85.129 19659270 2.036403

2076 rows × 8 columns

In [110]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): df_updated is a filtered slice of the imported frame; these
# column assignments may raise SettingWithCopyWarning — consider .copy() at
# the filtering step.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [111]:
# Zero-fill the NaNs that shift() introduced at the start of each country's series.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [112]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on every column from index 2 onward, including
# 'Mortality Rate' and its lag features (target leakage), on the full dataset
# before the train/test split, and on unscaled data. TODO confirm intent.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[112]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [113]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Keep only the first 5 components; each component is a linear combination of
# ALL columns passed to pca.fit, not one of the original input variables.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [114]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the raw feature names, but the values
# are principal components — the downstream "feature importance" table
# therefore ranks components, not the named variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [115]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns appear unused downstream (X is built from
# principal_df) — TODO confirm this step is still needed.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [116]:
# Design matrix: the five principal components; target: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'extreme_poverty', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the observations for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [117]:
# Fit scaling on the training set
# Fitting on the training split only prevents test-set information leaking
# into the scaler's mean/std estimates.
scaler = StandardScaler()
scaler.fit(X_train)
Out[117]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [118]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)  # returns a new array; X_train is unchanged
In [119]:
# Apply scaling on the test set (using the statistics fitted on the training set)
X_test_scaled = scaler.transform(X_test)
In [120]:
# Define XGBoost model (hyperparameters are selected by the grid search below)
xgb_model = xgb.XGBRegressor()

# Candidate hyperparameter grid: 3*3*3*3*2*2 = 324 combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [121]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 candidate settings x 10 folds = 3240 model fits; n_jobs=-1 parallelises
# across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9969017546192163
In [122]:
# GridSearchCV already re-fits the best estimator on the full training data
# when refit=True (the default), so best_estimator_ is ready to use and an
# explicit fit() call here would be redundant.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [123]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(p, q) computes the KL divergence after
# normalising both arguments into probability distributions — it is not a
# standard regression metric. TODO confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0023159124982087367
R2 Score: 0.9986516562574312
RMSE: 0.048124
Entropy Value: 0.0005495487035360241
In [124]:
# Rank the model inputs (PCA components labelled with the selected column
# names) by XGBoost importance score, largest first.
feature_importances = (
    pd.DataFrame({
        'feature': selected_cols,
        'importance': best_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)
feature_importances
Out[124]:
feature importance
0 hospital_beds_per_thousand 0.703587
1 extreme_poverty 0.249542
2 gdp_per_capita 0.030094
3 population_density 0.015271
4 population 0.001506
In [125]:
# Country pair-by-pair analysis relative to extreme poverty
In [126]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset 
# NOTE(review): hardcoded absolute Windows path — consider a configurable data directory.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[126]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [127]:
# Showing the pairings of countries based on extreme poverty (13 pairs of countries)
# Build one per-country frame, bound as df_<CountryName> (spaces removed),
# replacing 26 near-identical copy-pasted filter statements with a loop.
_pair_countries = [
    "Cyprus", "Czechia", "Finland", "France", "Netherlands", "Serbia",
    "Slovenia", "Switzerland", "Austria", "Belgium", "Canada", "Denmark",
    "Estonia", "Iceland", "Ireland", "Latvia", "Luxembourg", "Portugal",
    "Slovakia", "Sweden", "United Kingdom", "Bulgaria", "Italy", "Romania",
    "Spain", "United States",
]
for _country in _pair_countries:
    # e.g. "United Kingdom" -> df_UnitedKingdom, matching the names used below.
    globals()["df_" + _country.replace(" ", "")] = df[df.location == _country]
In [128]:
# tail(-2) returns everything except the first two rows — presumably to align
# the UK series' start with the other countries; TODO confirm the intent.
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [129]:
# Concatenate individual country dataframes by using the first country from each defined pair above and combining all those countries into a single dataframe
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): to_csv writes the row index as an extra leading column by
# default; pass index=False if that column is unwanted on re-import. TODO confirm.
dataframe_one.to_csv("dataframe-one.csv")
In [130]:
# Importing the dataframe that includes the first countries in each pairing of countries from previous step
# NOTE(review): hardcoded absolute Windows path — consider a configurable data directory.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[130]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [131]:
country1 = 'Cyprus'
country2 = 'Czechia'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() gives an independent frame so later cells can add the lag columns
# without triggering pandas' SettingWithCopyWarning on a filtered slice.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [132]:
# Quick visual check of the filtered two-country frame.
df_updated
Out[132]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919575

2061 rows × 10 columns

In [133]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): df_updated is a filtered slice of the imported frame; these
# column assignments may raise SettingWithCopyWarning — consider .copy() at
# the filtering step.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [134]:
# The first 1/7/30 rows of each country have no lag history; fill those
# missing lags with 0. NOTE(review): 0 conflates "no data yet" with a true
# zero mortality rate -- dropping the warm-up rows would be cleaner.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [135]:
# PCA to address multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA
# input still includes 'Mortality Rate' (the target) and its three lag
# columns -- the components therefore leak the target. PCA is also fitted
# on unscaled data (StandardScaler comes later), so large-magnitude columns
# dominate the components. Confirm both choices are intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[135]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [136]:
# Keep the first 7 principal components (one per population-health predictor).
# NOTE(review): the PCA input actually has 11 columns here (7 predictors +
# 'Mortality Rate' + 3 lag columns), so this keeps 7 of 11 components; they
# do not correspond one-to-one to the predictors.
n_components = 7  # number of components retained for the XGBoost model
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [137]:
# Wrap the retained components in a DataFrame.
# NOTE(review): these column names are misleading -- each column is a
# principal component (a linear mix of ALL PCA input columns, including the
# target and its lags), not the raw feature it is named after. Downstream
# "feature importance" tables inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [138]:
# One-hot encode 'location' with get_dummies().
# NOTE(review): the dummy columns are never used by the model below -- X is
# built from principal_df, and only 'Mortality Rate' is read from this frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [139]:
# Model inputs: the 7 retained principal components; target: the raw
# mortality rate. Row alignment between principal_df and df_updated relies
# on both frames preserving the same row order -- presumably true; verify.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for a daily time
# series this lets training see days after the test days (look-ahead
# leakage). A chronological split / TimeSeriesSplit is the safer design.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [140]:
# Fit the standardiser on the training split only, so test-set statistics
# do not leak into the scaling parameters.
# NOTE(review): scaling is applied AFTER PCA here; the usual order is to
# standardise before PCA so no single column dominates the components.
scaler = StandardScaler()
scaler.fit(X_train)
Out[140]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [141]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [142]:
# Apply the same train-fitted scaling to the test set (no refit)
X_test_scaled = scaler.transform(X_test)
In [143]:
# Define XGBoost model (hyperparameters chosen by the grid search below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune: 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],              # max tree depth
          'learning_rate': [0.1, 0.01, 0.001], # shrinkage per boosting round
          'n_estimators': [50, 100, 150],      # number of boosted trees
          'gamma': [0, 0.1, 0.2],              # min loss reduction to split
          'subsample': [0.8, 0.9],             # row fraction per tree
          'colsample_bytree': [0.8, 0.9]}      # column fraction per tree
In [144]:
# Grid search with 10-fold cross-validation (324 candidates x 10 folds;
# n_jobs=-1 parallelises across all cores; scoring defaults to R^2).
# NOTE(review): the folds are drawn from shuffled time-series rows, so CV
# scores share the same look-ahead leakage as the train/test split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the mean CV score of the best model
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968265232047949
In [145]:
# GridSearchCV refits the best hyperparameter combination on the whole
# training set automatically (refit=True is the default), so best_estimator_
# is already trained -- the previous explicit fit() call was redundant work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [146]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a KL-divergence-based "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) normalises both vectors into probability
# distributions and returns their KL divergence -- it is NOT a standard
# regression metric, and it returns inf whenever a prediction is <= 0
# where the actual value is positive. Clip predictions to a tiny positive
# floor so the reported value stays finite and well-defined.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.000978608688705832
R2 Score: 0.9982973886897886
RMSE: 0.031283
Entropy Value: 0.0005033828926016659
In [147]:
# Rank the model's inputs by XGBoost importance.
# NOTE(review): the inputs are principal components that were merely
# labelled with raw-feature names, so these rankings describe components,
# not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[147]:
feature importance
1 diabetes_prevalence 0.658933
0 cardiovasc_death_rate 0.184826
5 aged_65_older 0.094490
6 median_age 0.029701
2 female_smokers 0.024293
3 male_smokers 0.006019
4 life_expectancy 0.001737
In [148]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a relative path or
# a configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[148]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [149]:
country1 = 'Cyprus'
country2 = 'Czechia'

# Restrict to the country-level infrastructure/economic predictors plus the
# target, for the selected country pair (country health index analysis).
feature_cols = ['location', 'date', 'hospital_beds_per_thousand',
                'human_development_index', 'gdp_per_capita',
                'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [150]:
# Preview the filtered frame for the selected country pair
df_updated
Out[150]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 32415.132 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 32415.132 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 32415.132 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 32415.132 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 32415.132 127.657 896007 0.000000
... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.63 0.900 32605.906 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.63 0.900 32605.906 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.63 0.900 32605.906 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.63 0.900 32605.906 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.63 0.900 32605.906 137.176 10493990 0.919575

2061 rows × 8 columns

In [151]:
# Convert the time series into a supervised-learning table by adding lagged
# targets: the mortality rate observed 1 day, 7 days, and 30 days earlier.
# Lags are computed per country so one country's history never bleeds into
# another's; this tabular form is what lets XGBoost be applied directly to
# assess which variables best predict COVID-19 mortality per country.
# NOTE(review): shift() relies on rows already being in chronological order
# within each country -- appears true here, but worth confirming.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for n_periods, label in [(1, 'day'), (7, 'week'), (30, 'month')]:
    df_updated[f'prev_{label}_mortality'] = mortality_by_country.shift(n_periods)
In [152]:
# The first 1/7/30 rows of each country have no lag history; fill those
# missing lags with 0. NOTE(review): 0 conflates "no data yet" with a true
# zero mortality rate -- dropping the warm-up rows would be cleaner.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [153]:
# PCA to address multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA
# input still includes 'Mortality Rate' (the target) and its three lag
# columns -- the components therefore leak the target. PCA is also fitted
# on unscaled data (StandardScaler comes later), so large-magnitude columns
# such as population dominate the components. Confirm both are intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[153]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [154]:
# Keep the first 5 principal components (one per country-health predictor).
# NOTE(review): the PCA input actually has 8 columns here (5 predictors +
# 'Mortality Rate' + 3 lag columns), so this keeps 5 of 8 components; they
# do not correspond one-to-one to the predictors.
n_components = 5  # number of components retained for the XGBoost model
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [155]:
# Wrap the retained components in a DataFrame.
# NOTE(review): these column names are misleading -- each column is a
# principal component (a linear mix of ALL PCA input columns, including the
# target and its lags), not the raw feature it is named after. Downstream
# "feature importance" tables inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [156]:
# One-hot encode 'location' with get_dummies().
# NOTE(review): the dummy columns are never used by the model below -- X is
# built from principal_df, and only 'Mortality Rate' is read from this frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [157]:
# Model inputs: the 5 retained principal components; target: the raw
# mortality rate. Row alignment between principal_df and df_updated relies
# on both frames preserving the same row order -- presumably true; verify.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for a daily time
# series this lets training see days after the test days (look-ahead
# leakage). A chronological split / TimeSeriesSplit is the safer design.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [158]:
# Fit the standardiser on the training split only, so test-set statistics
# do not leak into the scaling parameters.
# NOTE(review): scaling is applied AFTER PCA here; the usual order is to
# standardise before PCA so no single column dominates the components.
scaler = StandardScaler()
scaler.fit(X_train)
Out[158]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [159]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [160]:
# Apply the same train-fitted scaling to the test set (no refit)
X_test_scaled = scaler.transform(X_test)
In [161]:
# Define XGBoost model (hyperparameters chosen by the grid search below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune: 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],              # max tree depth
          'learning_rate': [0.1, 0.01, 0.001], # shrinkage per boosting round
          'n_estimators': [50, 100, 150],      # number of boosted trees
          'gamma': [0, 0.1, 0.2],              # min loss reduction to split
          'subsample': [0.8, 0.9],             # row fraction per tree
          'colsample_bytree': [0.8, 0.9]}      # column fraction per tree
In [162]:
# Grid search with 10-fold cross-validation (324 candidates x 10 folds;
# n_jobs=-1 parallelises across all cores; scoring defaults to R^2).
# NOTE(review): the folds are drawn from shuffled time-series rows, so CV
# scores share the same look-ahead leakage as the train/test split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the mean CV score of the best model
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9943317261946557
In [163]:
# GridSearchCV refits the best hyperparameter combination on the whole
# training set automatically (refit=True is the default), so best_estimator_
# is already trained -- the previous explicit fit() call was redundant work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [164]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a KL-divergence-based "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) normalises both vectors into probability
# distributions and returns their KL divergence -- it is NOT a standard
# regression metric, and it returns inf whenever a prediction is <= 0
# where the actual value is positive. Clip predictions to a tiny positive
# floor so the reported value stays finite and well-defined.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0027632388218584106
R2 Score: 0.9951924382797649
RMSE: 0.052567
Entropy Value: 0.001055748763209947
In [165]:
# Rank the model's inputs by XGBoost importance.
# NOTE(review): the inputs are principal components that were merely
# labelled with raw-feature names, so these rankings describe components,
# not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[165]:
feature importance
1 human_development_index 0.462078
0 hospital_beds_per_thousand 0.393009
2 gdp_per_capita 0.063780
4 population 0.049975
3 population_density 0.031158
In [166]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a relative path or
# a configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[166]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [167]:
country1 = 'Finland'
country2 = 'France'

# Restrict to the demographic/health predictors plus the target, for the
# selected country pair (population health index analysis).
feature_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                'female_smokers', 'male_smokers', 'life_expectancy',
                'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [168]:
# Preview the filtered frame for the selected country pair
df_updated
Out[168]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411892

2137 rows × 10 columns

In [169]:
# Convert the time series into a supervised-learning table by adding lagged
# targets: the mortality rate observed 1 day, 7 days, and 30 days earlier.
# Lags are computed per country so one country's history never bleeds into
# another's; this tabular form is what lets XGBoost be applied directly to
# assess which variables best predict COVID-19 mortality per country.
# NOTE(review): shift() relies on rows already being in chronological order
# within each country -- appears true here, but worth confirming.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for n_periods, label in [(1, 'day'), (7, 'week'), (30, 'month')]:
    df_updated[f'prev_{label}_mortality'] = mortality_by_country.shift(n_periods)
In [170]:
# The first 1/7/30 rows of each country have no lag history; fill those
# missing lags with 0. NOTE(review): 0 conflates "no data yet" with a true
# zero mortality rate -- dropping the warm-up rows would be cleaner.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [171]:
# PCA to address multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA
# input still includes 'Mortality Rate' (the target) and its three lag
# columns -- the components therefore leak the target. PCA is also fitted
# on unscaled data (StandardScaler comes later), so large-magnitude columns
# dominate the components. Confirm both choices are intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[171]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [172]:
# Keep the first 7 principal components (one per population-health predictor).
# NOTE(review): the PCA input actually has 11 columns here (7 predictors +
# 'Mortality Rate' + 3 lag columns), so this keeps 7 of 11 components; they
# do not correspond one-to-one to the predictors.
n_components = 7  # number of components retained for the XGBoost model
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [173]:
# Wrap the retained components in a DataFrame.
# NOTE(review): these column names are misleading -- each column is a
# principal component (a linear mix of ALL PCA input columns, including the
# target and its lags), not the raw feature it is named after. Downstream
# "feature importance" tables inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [174]:
# One-hot encode 'location' with get_dummies().
# NOTE(review): the dummy columns are never used by the model below -- X is
# built from principal_df, and only 'Mortality Rate' is read from this frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [175]:
# Model inputs: the 7 retained principal components; target: the raw
# mortality rate. Row alignment between principal_df and df_updated relies
# on both frames preserving the same row order -- presumably true; verify.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for a daily time
# series this lets training see days after the test days (look-ahead
# leakage). A chronological split / TimeSeriesSplit is the safer design.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [176]:
# Fit the standardiser on the training split only, so test-set statistics
# do not leak into the scaling parameters.
# NOTE(review): scaling is applied AFTER PCA here; the usual order is to
# standardise before PCA so no single column dominates the components.
scaler = StandardScaler()
scaler.fit(X_train)
Out[176]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [177]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [178]:
# Apply the same train-fitted scaling to the test set (no refit)
X_test_scaled = scaler.transform(X_test)
In [179]:
# Define XGBoost model (hyperparameters chosen by the grid search below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune: 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],              # max tree depth
          'learning_rate': [0.1, 0.01, 0.001], # shrinkage per boosting round
          'n_estimators': [50, 100, 150],      # number of boosted trees
          'gamma': [0, 0.1, 0.2],              # min loss reduction to split
          'subsample': [0.8, 0.9],             # row fraction per tree
          'colsample_bytree': [0.8, 0.9]}      # column fraction per tree
In [180]:
# Grid search with 10-fold cross-validation (324 candidates x 10 folds;
# n_jobs=-1 parallelises across all cores; scoring defaults to R^2).
# NOTE(review): the folds are drawn from shuffled time-series rows, so CV
# scores share the same look-ahead leakage as the train/test split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the mean CV score of the best model
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.995847722245489
In [181]:
# GridSearchCV refits the best hyperparameter combination on the whole
# training set automatically (refit=True is the default), so best_estimator_
# is already trained -- the previous explicit fit() call was redundant work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [182]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a KL-divergence-based "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) normalises both vectors into probability
# distributions and returns their KL divergence -- it is NOT a standard
# regression metric, and it returns inf whenever a prediction is <= 0
# where the actual value is positive. Clip predictions to a tiny positive
# floor so the reported value stays finite and well-defined.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.025345261586814726
R2 Score: 0.9974928018718409
RMSE: 0.159202
Entropy Value: 0.0013971728366706324
In [183]:
# Rank the model's inputs by XGBoost importance.
# NOTE(review): the inputs are principal components that were merely
# labelled with raw-feature names, so these rankings describe components,
# not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[183]:
feature importance
0 cardiovasc_death_rate 0.435901
1 diabetes_prevalence 0.345667
5 aged_65_older 0.147699
2 female_smokers 0.029630
6 median_age 0.019171
3 male_smokers 0.014463
4 life_expectancy 0.007470
In [184]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a relative path or
# a configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[184]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [185]:
country1 = 'Finland'
country2 = 'France'

# Restrict to the country-level infrastructure/economic predictors plus the
# target, for the selected country pair (country health index analysis).
feature_cols = ['location', 'date', 'hospital_beds_per_thousand',
                'human_development_index', 'gdp_per_capita',
                'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), feature_cols]
In [186]:
# Preview the filtered frame for the selected country pair
df_updated
Out[186]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
7310 Finland 1/29/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
7311 Finland 1/30/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
7312 Finland 1/31/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
7313 Finland 2/1/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
7314 Finland 2/2/2020 3.28 0.938 40585.721 18.136 5540745 0.000000
... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.901 38605.671 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.901 38605.671 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.901 38605.671 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.901 38605.671 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.901 38605.671 122.578 67813000 0.411892

2137 rows × 8 columns

In [187]:
# Convert the time series into a supervised-learning table by adding lagged
# targets: the mortality rate observed 1 day, 7 days, and 30 days earlier.
# Lags are computed per country so one country's history never bleeds into
# another's; this tabular form is what lets XGBoost be applied directly to
# assess which variables best predict COVID-19 mortality per country.
# NOTE(review): shift() relies on rows already being in chronological order
# within each country -- appears true here, but worth confirming.
mortality_by_country = df_updated.groupby(['location'])['Mortality Rate']
for n_periods, label in [(1, 'day'), (7, 'week'), (30, 'month')]:
    df_updated[f'prev_{label}_mortality'] = mortality_by_country.shift(n_periods)
In [188]:
# The first 1/7/30 rows of each country have no lag history; fill those
# missing lags with 0. NOTE(review): 0 conflates "no data yet" with a true
# zero mortality rate -- dropping the warm-up rows would be cleaner.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [189]:
# PCA to address multi-collinearity among the predictors.
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the PCA
# input still includes 'Mortality Rate' (the target) and its three lag
# columns -- the components therefore leak the target. PCA is also fitted
# on unscaled data (StandardScaler comes later), so large-magnitude columns
# such as population dominate the components. Confirm both are intentional.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[189]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [190]:
# Keep the first 5 principal components (one per country-health predictor).
# NOTE(review): the PCA input actually has 8 columns here (5 predictors +
# 'Mortality Rate' + 3 lag columns), so this keeps 5 of 8 components; they
# do not correspond one-to-one to the predictors.
n_components = 5  # number of components retained for the XGBoost model
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [191]:
# Wrap the retained components in a DataFrame.
# NOTE(review): these column names are misleading -- each column is a
# principal component (a linear mix of ALL PCA input columns, including the
# target and its lags), not the raw feature it is named after. Downstream
# "feature importance" tables inherit this mislabeling.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [192]:
# One-hot encode 'location' with get_dummies().
# NOTE(review): the dummy columns are never used by the model below -- X is
# built from principal_df, and only 'Mortality Rate' is read from this frame.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [193]:
# Model inputs: the 5 retained principal components; target: the raw
# mortality rate. Row alignment between principal_df and df_updated relies
# on both frames preserving the same row order -- presumably true; verify.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): train_test_split shuffles by default; for a daily time
# series this lets training see days after the test days (look-ahead
# leakage). A chronological split / TimeSeriesSplit is the safer design.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [194]:
# Fit the standardiser on the training split only, so test-set statistics
# do not leak into the scaling parameters.
# NOTE(review): scaling is applied AFTER PCA here; the usual order is to
# standardise before PCA so no single column dominates the components.
scaler = StandardScaler()
scaler.fit(X_train)
Out[194]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [195]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [196]:
# Apply the same train-fitted scaling to the test set (no refit)
X_test_scaled = scaler.transform(X_test)
In [197]:
# Define XGBoost model (hyperparameters chosen by the grid search below)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune: 3*3*3*3*2*2 = 324 combinations
params = {'max_depth': [3, 4, 5],              # max tree depth
          'learning_rate': [0.1, 0.01, 0.001], # shrinkage per boosting round
          'n_estimators': [50, 100, 150],      # number of boosted trees
          'gamma': [0, 0.1, 0.2],              # min loss reduction to split
          'subsample': [0.8, 0.9],             # row fraction per tree
          'colsample_bytree': [0.8, 0.9]}      # column fraction per tree
In [198]:
# Grid search with 10-fold cross-validation (324 candidates x 10 folds;
# n_jobs=-1 parallelises across all cores; scoring defaults to R^2).
# NOTE(review): the folds are drawn from shuffled time-series rows, so CV
# scores share the same look-ahead leakage as the train/test split.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the mean CV score of the best model
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9953573241288052
In [199]:
# GridSearchCV refits the best hyperparameter combination on the whole
# training set automatically (refit=True is the default), so best_estimator_
# is already trained -- the previous explicit fit() call was redundant work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [200]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a KL-divergence-based "entropy" value.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(p, q) normalises both vectors into probability
# distributions and returns their KL divergence -- it is NOT a standard
# regression metric, and it returns inf whenever a prediction is <= 0
# where the actual value is positive (this run previously printed
# "Entropy Value: inf"). Clip predictions to a tiny positive floor so the
# reported value stays finite and well-defined.
entropy_val = entropy(y_test, np.clip(y_pred, 1e-12, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.08019301369568292
R2 Score: 0.9920671651724497
RMSE: 0.283184
Entropy Value: inf
In [201]:
# Rank the model's inputs by XGBoost importance.
# NOTE(review): the inputs are principal components that were merely
# labelled with raw-feature names, so these rankings describe components,
# not the original variables.
feature_importances = (
    pd.DataFrame({'feature': selected_cols,
                  'importance': best_model.feature_importances_})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[201]:
feature importance
1 human_development_index 0.459088
4 population 0.234622
0 hospital_beds_per_thousand 0.205086
2 gdp_per_capita 0.079619
3 population_density 0.021585
In [202]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- prefer a relative path or
# a configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[202]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [203]:
# Countries compared in this analysis run
country1 = 'Netherlands'
country2 = 'Serbia'

# Keep identifiers, the seven population-health predictors, and the target,
# then restrict the rows to the two countries of interest (same result as the
# column-select-then-filter form, in one .loc call).
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers', 'male_smokers',
                          'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
In [204]:
# Inspect the filtered two-country frame (rich display)
df_updated
Out[204]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716205

2075 rows × 10 columns

In [205]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes each country's rows are already in ascending
# date order (true for the frame displayed above, but worth asserting).
# NOTE(review): df_updated is a filtered slice of the imported frame; these
# column assignments may raise SettingWithCopyWarning — consider .copy().
# NOTE(review): the lag columns never appear in selected_cols; they reach the
# model only indirectly through the PCA fit on df_updated.iloc[:, 2:].
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [206]:
# Replace the NaNs introduced by the lag shifts (the first 1/7/30 rows per
# country) with 0, for all three lag columns at once.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [207]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[207]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [208]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [209]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component SCORES, not the original
# features — labelling PC1..PC7 with feature names makes the downstream
# feature-importance table read as if it ranked raw features. Consider
# 'PC1'..'PC7' labels (would require matching changes in later cells).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [210]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df), so this cell effectively just removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [211]:
# Model inputs: PCA scores (labelled with the original feature names)
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: COVID-19 mortality rate
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): rows form a per-country time series; a random split lets the
# model train on days adjacent to test days — a chronological split would give
# a more honest generalisation estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [212]:
# Fit scaling on the training set
# (fitted on the training split only and applied to both splits below —
# no scaler leakage here)
scaler = StandardScaler()
scaler.fit(X_train)
Out[212]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [213]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [214]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [215]:
# Define XGBoost model (regressor with library-default settings)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# Grid size: 3*3*3*3*2*2 = 324 combinations; with cv=10 that is 3240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [216]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelises the fits.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9994911517097467
In [217]:
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set — the
# explicit .fit() call here was redundant repeated work and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [218]:
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and a KL-divergence diagnostic
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(p, q) normalises both arrays to probability
# distributions and computes the KL divergence D(p||q); it returns inf
# whenever a prediction is <= 0 where the true value is positive. Clip both
# to a tiny positive floor so the diagnostic stays finite. Note KL divergence
# is not a standard regression metric — consider dropping it entirely.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0030060796550120455
R2 Score: 0.9996026740974251
RMSE: 0.054828
Entropy Value: 0.00020234686943402895
In [219]:
# Rank the model inputs by XGBoost's importance scores.
# NOTE(review): the 'feature' labels come from selected_cols; since the model
# inputs here are PCA scores labelled with feature names, this table ranks
# principal components rather than the raw features.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[219]:
feature importance
6 median_age 0.757461
1 diabetes_prevalence 0.230741
5 aged_65_older 0.005390
2 female_smokers 0.004505
0 cardiovasc_death_rate 0.001455
3 male_smokers 0.000398
4 life_expectancy 0.000050
In [220]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[220]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [221]:
# Countries compared in this analysis run
country1 = 'Netherlands'
country2 = 'Serbia'

# Keep identifiers, the five country-level predictors, and the target,
# then restrict the rows to the two countries of interest (same result as the
# column-select-then-filter form, in one .loc call).
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'gdp_per_capita',
                       'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
In [222]:
# Inspect the filtered two-country frame (rich display)
df_updated
Out[222]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.320 0.944 48472.545 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 14048.881 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 14048.881 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 14048.881 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 14048.881 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 14048.881 80.291 6871547 0.716205

2075 rows × 8 columns

In [223]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes each country's rows are already in ascending
# date order (true for the frame displayed above, but worth asserting).
# NOTE(review): df_updated is a filtered slice of the imported frame; these
# column assignments may raise SettingWithCopyWarning — consider .copy().
# NOTE(review): the lag columns never appear in selected_cols; they reach the
# model only indirectly through the PCA fit on df_updated.iloc[:, 2:].
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [224]:
# Replace the NaNs introduced by the lag shifts (the first 1/7/30 rows per
# country) with 0, for all three lag columns at once.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [225]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[225]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [226]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [227]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component SCORES, not the original
# features — labelling PC1..PC5 with feature names makes the downstream
# feature-importance table read as if it ranked raw features. Consider
# 'PC1'..'PC5' labels (would require matching changes in later cells).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [228]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df), so this cell effectively just removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [229]:
# Model inputs: PCA scores (labelled with the original feature names)
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
# Target: COVID-19 mortality rate
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): rows form a per-country time series; a random split lets the
# model train on days adjacent to test days — a chronological split would give
# a more honest generalisation estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [230]:
# Fit scaling on the training set
# (fitted on the training split only and applied to both splits below —
# no scaler leakage here)
scaler = StandardScaler()
scaler.fit(X_train)
Out[230]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [231]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [232]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [233]:
# Define XGBoost model (regressor with library-default settings)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# Grid size: 3*3*3*3*2*2 = 324 combinations; with cv=10 that is 3240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [234]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelises the fits.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.997512391985607
In [235]:
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set — the
# explicit .fit() call here was redundant repeated work and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [236]:
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and a KL-divergence diagnostic
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(p, q) normalises both arrays to probability
# distributions and computes the KL divergence D(p||q); it returns inf
# whenever a prediction is <= 0 where the true value is positive. Clip both
# to a tiny positive floor so the diagnostic stays finite. Note KL divergence
# is not a standard regression metric — consider dropping it entirely.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.017843262238995315
R2 Score: 0.9976415826965301
RMSE: 0.133579
Entropy Value: 0.003495616937415529
In [237]:
# Rank the model inputs by XGBoost's importance scores.
# NOTE(review): the 'feature' labels come from selected_cols; since the model
# inputs here are PCA scores labelled with feature names, this table ranks
# principal components rather than the raw features.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[237]:
feature importance
1 human_development_index 0.628833
2 gdp_per_capita 0.207723
0 hospital_beds_per_thousand 0.148684
3 population_density 0.007596
4 population 0.007164
In [238]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[238]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [239]:
# Countries compared in this analysis run
country1 = 'Slovenia'
country2 = 'Switzerland'

# Keep identifiers, the seven population-health predictors, and the target,
# then restrict the rows to the two countries of interest (same result as the
# column-select-then-filter form, in one .loc call).
population_health_cols = ['location', 'date', 'cardiovasc_death_rate',
                          'diabetes_prevalence', 'female_smokers', 'male_smokers',
                          'life_expectancy', 'aged_65_older', 'median_age',
                          'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            population_health_cols]
In [240]:
# Inspect the filtered two-country frame (rich display)
df_updated
Out[240]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2101 rows × 10 columns

In [241]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes each country's rows are already in ascending
# date order (true for the frame displayed above, but worth asserting).
# NOTE(review): df_updated is a filtered slice of the imported frame; these
# column assignments may raise SettingWithCopyWarning — consider .copy().
# NOTE(review): the lag columns never appear in selected_cols; they reach the
# model only indirectly through the PCA fit on df_updated.iloc[:, 2:].
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [242]:
# Replace the NaNs introduced by the lag shifts (the first 1/7/30 rows per
# country) with 0, for all three lag columns at once.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [243]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[243]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [244]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [245]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component SCORES, not the original
# features — labelling PC1..PC7 with feature names makes the downstream
# feature-importance table read as if it ranked raw features. Consider
# 'PC1'..'PC7' labels (would require matching changes in later cells).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [246]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df), so this cell effectively just removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [247]:
# Model inputs: PCA scores (labelled with the original feature names)
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
# Target: COVID-19 mortality rate
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): rows form a per-country time series; a random split lets the
# model train on days adjacent to test days — a chronological split would give
# a more honest generalisation estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [248]:
# Fit scaling on the training set
# (fitted on the training split only and applied to both splits below —
# no scaler leakage here)
scaler = StandardScaler()
scaler.fit(X_train)
Out[248]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [249]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [250]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [251]:
# Define XGBoost model (regressor with library-default settings)
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# Grid size: 3*3*3*3*2*2 = 324 combinations; with cv=10 that is 3240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [252]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2; n_jobs=-1 parallelises the fits.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979540624404055
In [253]:
# Retrieve the tuned model. GridSearchCV uses refit=True by default, so
# best_estimator_ has already been refit on the full training set — the
# explicit .fit() call here was redundant repeated work and is removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [254]:
# Evaluate the XGBoost model: MSE, RMSE, R^2 Score, and a KL-divergence diagnostic
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# BUG FIX: scipy.stats.entropy(p, q) normalises both arrays to probability
# distributions and computes the KL divergence D(p||q); it returns inf
# whenever a prediction is <= 0 where the true value is positive. Clip both
# to a tiny positive floor so the diagnostic stays finite. Note KL divergence
# is not a standard regression metric — consider dropping it entirely.
eps = 1e-12
entropy_val = entropy(np.clip(y_test, eps, None), np.clip(y_pred, eps, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0069541682155272394
R2 Score: 0.997542383229531
RMSE: 0.083392
Entropy Value: 0.0005493830796332597
In [255]:
# Rank the model inputs by XGBoost's importance scores.
# NOTE(review): the 'feature' labels come from selected_cols; since the model
# inputs here are PCA scores labelled with feature names, this table ranks
# principal components rather than the raw features.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[255]:
feature importance
1 diabetes_prevalence 0.588315
6 median_age 0.127966
0 cardiovasc_death_rate 0.102203
3 male_smokers 0.079325
2 female_smokers 0.058231
5 aged_65_older 0.043306
4 life_expectancy 0.000655
In [256]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR (pathlib.Path) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[256]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [257]:
# Countries compared in this analysis run
country1 = 'Slovenia'
country2 = 'Switzerland'

# Keep identifiers, the five country-level predictors, and the target,
# then restrict the rows to the two countries of interest (same result as the
# column-select-then-filter form, in one .loc call).
country_health_cols = ['location', 'date', 'hospital_beds_per_thousand',
                       'human_development_index', 'gdp_per_capita',
                       'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]),
                            country_health_cols]
In [258]:
# Inspect the filtered two-country frame (rich display)
df_updated
Out[258]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 57410.166 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 31400.840 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 31400.840 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 31400.840 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 31400.840 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 31400.840 102.619 2119843 0.536669

2101 rows × 8 columns

In [259]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes each country's rows are already in ascending
# date order (true for the frame displayed above, but worth asserting).
# NOTE(review): df_updated is a filtered slice of the imported frame; these
# column assignments may raise SettingWithCopyWarning — consider .copy().
# NOTE(review): the lag columns never appear in selected_cols; they reach the
# model only indirectly through the PCA fit on df_updated.iloc[:, 2:].
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [260]:
# Replace the NaNs introduced by the lag shifts (the first 1/7/30 rows per
# country) with 0, for all three lag columns at once.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [261]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[261]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [262]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [263]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns hold principal-component SCORES, not the original
# features — labelling PC1..PC5 with feature names makes the downstream
# feature-importance table read as if it ranked raw features. Consider
# 'PC1'..'PC5' labels (would require matching changes in later cells).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [264]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used afterwards (X is built from
# principal_df), so this cell effectively just removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [265]:
# The principal components (from principal_df) are the model inputs; the
# target comes from the original frame. Row order matches between the two.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [266]:
# Fit scaling on the training set
# Learn per-feature mean and standard deviation from the training split only,
# so the test set is later scaled without peeking at its statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[266]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [267]:
# Apply scaling on the training set
# (standardizes with the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [268]:
# Apply scaling on the test set
# (uses the training-set statistics, not the test set's own mean/std)
X_test_scaled = scaler.transform(X_test)
In [269]:
# Define XGBoost model
# Gradient-boosted regression trees; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# max_depth: tree depth; learning_rate: shrinkage per boosting round;
# n_estimators: number of trees; gamma: minimum loss reduction to split;
# subsample / colsample_bytree: row / column sampling fractions per tree.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [270]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Exhaustive search over the params grid (324 combinations), parallelized
# across all cores; each candidate is scored by 10-fold CV (default R^2).
# NOTE(review): the folds are drawn from already-shuffled rows of a time
# series, so temporally adjacent observations can land in both train and
# validation folds — consider TimeSeriesSplit for a stricter evaluation.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989566850992713
In [271]:
# GridSearchCV refits the best estimator on the full training set by default
# (refit=True), so best_estimator_ is already trained — the explicit second
# fit() call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [272]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it becomes infinite wherever y_pred is 0 but
# y_test is positive. Interpret with caution (or drop).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00732376448420232
R2 Score: 0.9974117671788335
RMSE: 0.085579
Entropy Value: 0.0010900984099785184
In [273]:
feature_importances = best_model.feature_importances_
# NOTE(review): the 'feature' labels are PCA component columns that were given
# original-variable names earlier, so this ranking is over principal
# components, not the raw input features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[273]:
feature importance
1 human_development_index 0.690075
2 gdp_per_capita 0.192149
3 population_density 0.091684
0 hospital_beds_per_thousand 0.024127
4 population 0.001965
In [274]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable across machines;
# prefer a configurable data directory. Reloading here resets df_updated to the
# full 26-country frame before the next country-pair analysis.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[274]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [275]:
country1 = 'Austria'
country2 = 'Belgium'

# Keep the population-health features plus the target, restricted to the two
# countries under comparison (row filter and column selection in one .loc).
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [276]:
df_updated
Out[276]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2095 Belgium 12/26/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2096 Belgium 12/27/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2097 Belgium 12/28/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2098 Belgium 12/29/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787

2099 rows × 10 columns

In [277]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged mortality features (1-day, 7-day, 30-day) per country so the
# time series can be modelled as a tabular supervised-learning problem.
for lag_days, col_name in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[col_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [278]:
# Rows at the start of each country's series have no lag history; treat the
# missing lagged values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [279]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix fed
# to PCA also contains 'Mortality Rate' itself plus the lagged mortality
# columns — the components therefore encode the target (target leakage), which
# likely explains the near-perfect R^2 downstream. Consider excluding the
# target before fitting.
# NOTE(review): PCA is fit on the full dataset before the train/test split and
# on unscaled features — both are worth revisiting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[279]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [280]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the retained components are the top-variance linear mixes of
# every column passed to PCA — matching their count to the number of original
# input variables does not make them those variables.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [281]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component, but the labels reuse
# the original variable names — downstream "feature importances" keyed on these
# labels describe components, not the raw features. Consider PC1..PCn names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [282]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only df_updated['Mortality Rate'] is read after this cell, so
# the dummy columns created here appear unused — confirm and consider dropping
# this step.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [283]:
# The principal components (from principal_df) are the model inputs; the
# target comes from the original frame. Row order matches between the two.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [284]:
# Fit scaling on the training set
# Learn per-feature mean and standard deviation from the training split only,
# so the test set is later scaled without peeking at its statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[284]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [285]:
# Apply scaling on the training set
# (standardizes with the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [286]:
# Apply scaling on the test set
# (uses the training-set statistics, not the test set's own mean/std)
X_test_scaled = scaler.transform(X_test)
In [287]:
# Define XGBoost model
# Gradient-boosted regression trees; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# max_depth: tree depth; learning_rate: shrinkage per boosting round;
# n_estimators: number of trees; gamma: minimum loss reduction to split;
# subsample / colsample_bytree: row / column sampling fractions per tree.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [288]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Exhaustive search over the params grid (324 combinations), parallelized
# across all cores; each candidate is scored by 10-fold CV (default R^2).
# NOTE(review): the folds are drawn from already-shuffled rows of a time
# series, so temporally adjacent observations can land in both train and
# validation folds — consider TimeSeriesSplit for a stricter evaluation.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985877971766527
In [289]:
# GridSearchCV refits the best estimator on the full training set by default
# (refit=True), so best_estimator_ is already trained — the explicit second
# fit() call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [290]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it becomes infinite wherever y_pred is 0 but
# y_test is positive. Interpret with caution (or drop).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005399476083504289
R2 Score: 0.9995439543541608
RMSE: 0.073481
Entropy Value: 0.0003649161068700861
In [291]:
feature_importances = best_model.feature_importances_
# NOTE(review): the 'feature' labels are PCA component columns that were given
# original-variable names earlier, so this ranking is over principal
# components, not the raw input features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[291]:
feature importance
6 median_age 0.846396
0 cardiovasc_death_rate 0.062219
1 diabetes_prevalence 0.061911
5 aged_65_older 0.025860
2 female_smokers 0.002106
3 male_smokers 0.001366
4 life_expectancy 0.000142
In [292]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable across machines;
# prefer a configurable data directory. Reloading here resets df_updated to the
# full 26-country frame before the next country-pair analysis.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[292]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [293]:
country1 = 'Austria'
country2 = 'Belgium'

# Keep the country-health-index features plus the target, restricted to the
# two countries under comparison (row filter and column selection in one .loc).
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [294]:
df_updated
Out[294]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 45436.686 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.931 42658.576 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.931 42658.576 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.931 42658.576 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.931 42658.576 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.931 42658.576 375.564 11655923 0.711787

2099 rows × 8 columns

In [295]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged mortality features (1-day, 7-day, 30-day) per country so the
# time series can be modelled as a tabular supervised-learning problem.
for lag_days, col_name in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[col_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [296]:
# Rows at the start of each country's series have no lag history; treat the
# missing lagged values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [297]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix fed
# to PCA also contains 'Mortality Rate' itself plus the lagged mortality
# columns — the components therefore encode the target (target leakage), which
# likely explains the near-perfect R^2 downstream. Consider excluding the
# target before fitting.
# NOTE(review): PCA is fit on the full dataset before the train/test split and
# on unscaled features — both are worth revisiting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[297]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [298]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the retained components are the top-variance linear mixes of
# every column passed to PCA — matching their count to the number of original
# input variables does not make them those variables.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [299]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component, but the labels reuse
# the original variable names — downstream "feature importances" keyed on these
# labels describe components, not the raw features. Consider PC1..PCn names.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [300]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only df_updated['Mortality Rate'] is read after this cell, so
# the dummy columns created here appear unused — confirm and consider dropping
# this step.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [301]:
# The principal components (from principal_df) are the model inputs; the
# target comes from the original frame. Row order matches between the two.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [302]:
# Fit scaling on the training set
# Learn per-feature mean and standard deviation from the training split only,
# so the test set is later scaled without peeking at its statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[302]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [303]:
# Apply scaling on the training set
# (standardizes with the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [304]:
# Apply scaling on the test set
# (uses the training-set statistics, not the test set's own mean/std)
X_test_scaled = scaler.transform(X_test)
In [305]:
# Define XGBoost model
# Gradient-boosted regression trees; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# max_depth: tree depth; learning_rate: shrinkage per boosting round;
# n_estimators: number of trees; gamma: minimum loss reduction to split;
# subsample / colsample_bytree: row / column sampling fractions per tree.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [306]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Exhaustive search over the params grid (324 combinations), parallelized
# across all cores; each candidate is scored by 10-fold CV (default R^2).
# NOTE(review): the folds are drawn from already-shuffled rows of a time
# series, so temporally adjacent observations can land in both train and
# validation folds — consider TimeSeriesSplit for a stricter evaluation.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979482681643022
In [307]:
# GridSearchCV refits the best estimator on the full training set by default
# (refit=True), so best_estimator_ is already trained — the explicit second
# fit() call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [308]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it becomes infinite wherever y_pred is 0 but
# y_test is positive. Interpret with caution (or drop).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.017351533220202466
R2 Score: 0.9985344705576373
RMSE: 0.131725
Entropy Value: 0.0015777407587553624
In [309]:
feature_importances = best_model.feature_importances_
# NOTE(review): the 'feature' labels are PCA component columns that were given
# original-variable names earlier, so this ranking is over principal
# components, not the raw input features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[309]:
feature importance
1 human_development_index 0.700511
2 gdp_per_capita 0.139220
0 hospital_beds_per_thousand 0.123668
3 population_density 0.032006
4 population 0.004595
In [310]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable across machines;
# prefer a configurable data directory. Reloading here resets df_updated to the
# full 26-country frame before the next country-pair analysis.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[310]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [311]:
country1 = 'Canada'
country2 = 'Denmark'

# Keep the population-health features plus the target, restricted to the two
# countries under comparison (row filter and column selection in one .loc).
keep_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [312]:
df_updated
Out[312]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2134 rows × 10 columns

In [313]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Add lagged mortality features (1-day, 7-day, 30-day) per country so the
# time series can be modelled as a tabular supervised-learning problem.
for lag_days, col_name in [(1, 'prev_day_mortality'), (7, 'prev_week_mortality'), (30, 'prev_month_mortality')]:
    df_updated[col_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [314]:
# Rows at the start of each country's series have no lag history; treat the
# missing lagged values as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [315]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] skips only 'location' and 'date', so the matrix fed
# to PCA also contains 'Mortality Rate' itself plus the lagged mortality
# columns — the components therefore encode the target (target leakage), which
# likely explains the near-perfect R^2 downstream. Consider excluding the
# target before fitting.
# NOTE(review): PCA is fit on the full dataset before the train/test split and
# on unscaled features — both are worth revisiting.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[315]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [316]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the retained components are the top-variance linear mixes of
# every column passed to PCA — matching their count to the number of original
# input variables does not make them those variables.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [317]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component, but the labels reuse
# the original variable names — downstream "feature importances" keyed on these
# labels describe components, not the raw features. Consider PC1..PCn names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [318]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only df_updated['Mortality Rate'] is read after this cell, so
# the dummy columns created here appear unused — confirm and consider dropping
# this step.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [319]:
# The principal components (from principal_df) are the model inputs; the
# target comes from the original frame. Row order matches between the two.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [320]:
# Fit scaling on the training set
# Learn per-feature mean and standard deviation from the training split only,
# so the test set is later scaled without peeking at its statistics.
scaler = StandardScaler()
scaler.fit(X_train)
Out[320]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [321]:
# Apply scaling on the training set
# (standardizes with the mean/std learned from X_train above)
X_train_scaled = scaler.transform(X_train)
In [322]:
# Apply scaling on the test set
# (uses the training-set statistics, not the test set's own mean/std)
X_test_scaled = scaler.transform(X_test)
In [323]:
# Define XGBoost model
# Gradient-boosted regression trees; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# max_depth: tree depth; learning_rate: shrinkage per boosting round;
# n_estimators: number of trees; gamma: minimum loss reduction to split;
# subsample / colsample_bytree: row / column sampling fractions per tree.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [324]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Exhaustive search over the params grid (324 combinations), parallelized
# across all cores; each candidate is scored by 10-fold CV (default R^2).
# NOTE(review): the folds are drawn from already-shuffled rows of a time
# series, so temporally adjacent observations can land in both train and
# validation folds — consider TimeSeriesSplit for a stricter evaluation.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992606850560544
In [325]:
# GridSearchCV refits the best estimator on the full training set by default
# (refit=True), so best_estimator_ is already trained — the explicit second
# fit() call was redundant and has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [326]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it becomes infinite wherever y_pred is 0 but
# y_test is positive. Interpret with caution (or drop).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0025729522227882153
R2 Score: 0.9993859368044151
RMSE: 0.050724
Entropy Value: 0.00030984264476932686
In [327]:
feature_importances = best_model.feature_importances_
# NOTE(review): the 'feature' labels are PCA component columns that were given
# original-variable names earlier, so this ranking is over principal
# components, not the raw input features.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[327]:
feature importance
1 diabetes_prevalence 0.665696
6 median_age 0.173524
0 cardiovasc_death_rate 0.127952
5 aged_65_older 0.019835
2 female_smokers 0.011763
3 male_smokers 0.001135
4 life_expectancy 0.000095
In [328]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute local path — not portable across machines;
# prefer a configurable data directory. Reloading here resets df_updated to the
# full 26-country frame before the next country-pair analysis.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[328]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [329]:
country1 = 'Canada'
country2 = 'Denmark'

# Keep the country-health-index features plus the target, restricted to the
# two countries under comparison (row filter and column selection in one .loc).
keep_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), keep_cols]
In [330]:
df_updated
Out[330]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
5187 Denmark 2/2/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
5188 Denmark 2/3/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
5189 Denmark 2/4/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
5190 Denmark 2/5/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
5191 Denmark 2/6/2020 2.5 0.940 46682.515 136.520 5882259 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 44017.591 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 44017.591 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 44017.591 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 44017.591 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 44017.591 4.037 38454328 1.093162

2134 rows × 8 columns

In [331]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged mortality column per horizon (1 day, 1 week, 1 month),
# shifting within each country so values never cross country boundaries.
for lag_days, label in ((1, 'day'), (7, 'week'), (30, 'month')):
    col_name = 'prev_{}_mortality'.format(label)
    df_updated[col_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [332]:
# The first rows of each country have no lag history; treat that missing
# history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [333]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and its lagged
# copies among the PCA inputs (target leakage), and PCA is fitted on unscaled
# data over the full dataset before the train/test split — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[333]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [334]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# Keep only the first 5 component scores; PCA orders columns by explained variance.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [335]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores (linear mixtures of
# all PCA inputs) merely relabeled with the original feature names — downstream
# "feature importances" therefore describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [336]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used in later visible cells (X is built
# from principal_df), so this step only reshapes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [337]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
# Feature matrix = PCA scores; target = raw mortality rate from the original frame.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [338]:
# Fit scaling on the training set
# The scaler learns mean/std from the training split only, so no test-set
# statistics leak into scaling; the bare fit() on the last line displays the estimator.
scaler = StandardScaler()
scaler.fit(X_train)
Out[338]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [339]:
# Apply scaling on the training set
# Uses the mean/std learned from X_train in the previous cell.
X_train_scaled = scaler.transform(X_train)
In [340]:
# Apply scaling on the test set
# Reuses the train-set statistics — never re-fit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [341]:
# Define the baseline XGBoost regressor; the settings below are tuned by grid search.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [342]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE: 324 parameter combinations x 10 folds = 3,240 model fits; n_jobs=-1
# parallelizes across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981308764210667
In [343]:
# GridSearchCV uses refit=True by default, so best_estimator_ was already
# retrained on the full (scaled) training set during the search; calling fit()
# again here would only repeat that work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [344]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric and yields inf if any y_pred is 0 where y_test > 0. Confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006611715252313853
R2 Score: 0.998422041824106
RMSE: 0.081312
Entropy Value: 0.001601041202106293
In [345]:
# NOTE(review): these importances are attributed to principal components that
# were relabeled with raw-feature names in an earlier cell; they do not measure
# the raw features directly.
importances = best_model.feature_importances_  # one value per column of X
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[345]:
feature importance
1 human_development_index 0.723446
0 hospital_beds_per_thousand 0.146505
2 gdp_per_capita 0.065728
3 population_density 0.048554
4 population 0.015767
In [346]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR
# so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[346]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [347]:
country1 = 'Estonia'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the filtered result an independent DataFrame (not a view), so the
# lagged-column assignments in later cells cannot raise SettingWithCopyWarning or
# silently fail to write back.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [348]:
df_updated
Out[348]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.00000
... ... ... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21943 Iceland 12/26/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21944 Iceland 12/27/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21945 Iceland 12/28/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011
21946 Iceland 12/29/2022 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.11011

2097 rows × 10 columns

In [349]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged mortality column per horizon (1 day, 1 week, 1 month),
# shifting within each country so values never cross country boundaries.
for lag_days, label in ((1, 'day'), (7, 'week'), (30, 'month')):
    col_name = 'prev_{}_mortality'.format(label)
    df_updated[col_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [350]:
# The first rows of each country have no lag history; treat that missing
# history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [351]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and its lagged
# copies among the PCA inputs (target leakage), and PCA is fitted on unscaled
# data over the full dataset before the train/test split — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[351]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [352]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# Keep only the first 7 component scores; PCA orders columns by explained variance.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [353]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores (linear mixtures of
# all PCA inputs) merely relabeled with the original feature names — downstream
# "feature importances" therefore describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [354]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used in later visible cells (X is built
# from principal_df), so this step only reshapes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [355]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Feature matrix = PCA scores; target = raw mortality rate from the original frame.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [356]:
# Fit scaling on the training set
# The scaler learns mean/std from the training split only, so no test-set
# statistics leak into scaling; the bare fit() on the last line displays the estimator.
scaler = StandardScaler()
scaler.fit(X_train)
Out[356]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [357]:
# Apply scaling on the training set
# Uses the mean/std learned from X_train in the previous cell.
X_train_scaled = scaler.transform(X_train)
In [358]:
# Apply scaling on the test set
# Reuses the train-set statistics — never re-fit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [359]:
# Define the baseline XGBoost regressor; the settings below are tuned by grid search.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [360]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE: 324 parameter combinations x 10 folds = 3,240 model fits; n_jobs=-1
# parallelizes across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988310609150656
In [361]:
# GridSearchCV uses refit=True by default, so best_estimator_ was already
# retrained on the full (scaled) training set during the search; calling fit()
# again here would only repeat that work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [362]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric and yields inf if any y_pred is 0 where y_test > 0. Confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015892974905094047
R2 Score: 0.997047995814913
RMSE: 0.039866
Entropy Value: 0.0010983888612615478
In [363]:
# NOTE(review): these importances are attributed to principal components that
# were relabeled with raw-feature names in an earlier cell; they do not measure
# the raw features directly.
importances = best_model.feature_importances_  # one value per column of X
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[363]:
feature importance
0 cardiovasc_death_rate 0.544620
1 diabetes_prevalence 0.367091
6 median_age 0.053979
5 aged_65_older 0.021297
2 female_smokers 0.012030
3 male_smokers 0.000796
4 life_expectancy 0.000188
In [364]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR
# so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[364]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [365]:
country1 = 'Estonia'
country2 = 'Iceland'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() makes the filtered result an independent DataFrame (not a view), so the
# lagged-column assignments in later cells cannot raise SettingWithCopyWarning or
# silently fail to write back.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [366]:
df_updated
Out[366]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
6250 Estonia 1/18/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
6251 Estonia 2/5/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
6252 Estonia 2/6/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
6253 Estonia 2/7/2020 4.69 0.892 29481.252 31.033 1326064 0.00000
... ... ... ... ... ... ... ... ...
21942 Iceland 12/25/2022 2.91 0.949 46482.958 3.404 372903 0.11011
21943 Iceland 12/26/2022 2.91 0.949 46482.958 3.404 372903 0.11011
21944 Iceland 12/27/2022 2.91 0.949 46482.958 3.404 372903 0.11011
21945 Iceland 12/28/2022 2.91 0.949 46482.958 3.404 372903 0.11011
21946 Iceland 12/29/2022 2.91 0.949 46482.958 3.404 372903 0.11011

2097 rows × 8 columns

In [367]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged mortality column per horizon (1 day, 1 week, 1 month),
# shifting within each country so values never cross country boundaries.
for lag_days, label in ((1, 'day'), (7, 'week'), (30, 'month')):
    col_name = 'prev_{}_mortality'.format(label)
    df_updated[col_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [368]:
# The first rows of each country have no lag history; treat that missing
# history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [369]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and its lagged
# copies among the PCA inputs (target leakage), and PCA is fitted on unscaled
# data over the full dataset before the train/test split — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[369]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [370]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# Keep only the first 5 component scores; PCA orders columns by explained variance.
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [371]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores (linear mixtures of
# all PCA inputs) merely relabeled with the original feature names — downstream
# "feature importances" therefore describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [372]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used in later visible cells (X is built
# from principal_df), so this step only reshapes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [373]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
# Feature matrix = PCA scores; target = raw mortality rate from the original frame.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [374]:
# Fit scaling on the training set
# The scaler learns mean/std from the training split only, so no test-set
# statistics leak into scaling; the bare fit() on the last line displays the estimator.
scaler = StandardScaler()
scaler.fit(X_train)
Out[374]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [375]:
# Apply scaling on the training set
# Uses the mean/std learned from X_train in the previous cell.
X_train_scaled = scaler.transform(X_train)
In [376]:
# Apply scaling on the test set
# Reuses the train-set statistics — never re-fit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [377]:
# Define the baseline XGBoost regressor; the settings below are tuned by grid search.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [378]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE: 324 parameter combinations x 10 folds = 3,240 model fits; n_jobs=-1
# parallelizes across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9984739118394756
In [379]:
# GridSearchCV uses refit=True by default, so best_estimator_ was already
# retrained on the full (scaled) training set during the search; calling fit()
# again here would only repeat that work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [380]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric and yields inf if any y_pred is 0 where y_test > 0. Confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0013145855112608861
R2 Score: 0.9975582532823021
RMSE: 0.036257
Entropy Value: 0.0008164640345889426
In [381]:
# NOTE(review): these importances are attributed to principal components that
# were relabeled with raw-feature names in an earlier cell; they do not measure
# the raw features directly.
importances = best_model.feature_importances_  # one value per column of X
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[381]:
feature importance
1 human_development_index 0.646961
0 hospital_beds_per_thousand 0.271199
2 gdp_per_capita 0.066857
3 population_density 0.013454
4 population 0.001528
In [382]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable DATA_DIR
# so the notebook is runnable on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[382]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [383]:
country1 = 'Ireland'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() makes the filtered result an independent DataFrame (not a view), so the
# lagged-column assignments in later cells cannot raise SettingWithCopyWarning or
# silently fail to write back.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [384]:
df_updated
Out[384]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
18838 Ireland 2/29/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18839 Ireland 3/1/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18840 Ireland 3/2/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18841 Ireland 3/3/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
18842 Ireland 3/4/2020 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2073 rows × 10 columns

In [385]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build one lagged mortality column per horizon (1 day, 1 week, 1 month),
# shifting within each country so values never cross country boundaries.
for lag_days, label in ((1, 'day'), (7, 'week'), (30, 'month')):
    col_name = 'prev_{}_mortality'.format(label)
    df_updated[col_name] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [386]:
# The first rows of each country have no lag history; treat that missing
# history as a mortality rate of 0.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [387]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes the target 'Mortality Rate' and its lagged
# copies among the PCA inputs (target leakage), and PCA is fitted on unscaled
# data over the full dataset before the train/test split — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[387]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [388]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# Keep only the first 7 component scores; PCA orders columns by explained variance.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [389]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal-component scores (linear mixtures of
# all PCA inputs) merely relabeled with the original feature names — downstream
# "feature importances" therefore describe components, not the raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [390]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are not used in later visible cells (X is built
# from principal_df), so this step only reshapes df_updated's columns.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [391]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Feature matrix = PCA scores; target = raw mortality rate from the original frame.
X = principal_df.loc[:, selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [392]:
# Fit scaling on the training set
# The scaler learns mean/std from the training split only, so no test-set
# statistics leak into scaling; the bare fit() on the last line displays the estimator.
scaler = StandardScaler()
scaler.fit(X_train)
Out[392]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [393]:
# Apply scaling on the training set
# Uses the mean/std learned from X_train in the previous cell.
X_train_scaled = scaler.transform(X_train)
In [394]:
# Apply scaling on the test set
# Reuses the train-set statistics — never re-fit the scaler on test data.
X_test_scaled = scaler.transform(X_test)
In [395]:
# Define the baseline XGBoost regressor; the settings below are tuned by grid search.
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [396]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE: 324 parameter combinations x 10 folds = 3,240 model fits; n_jobs=-1
# parallelizes across all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9986904056870012
In [397]:
# GridSearchCV uses refit=True by default, so best_estimator_ was already
# retrained on the full (scaled) training set during the search; calling fit()
# again here would only repeat that work.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out, scaled test set
y_pred = best_model.predict(X_test_scaled)
In [398]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy normalizes both arrays into probability
# distributions and returns their KL divergence — it is not a standard regression
# error metric and yields inf if any y_pred is 0 where y_test > 0. Confirm intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0019402535771077944
R2 Score: 0.9991197380127789
RMSE: 0.044048
Entropy Value: 0.00025487866277611697
In [399]:
# NOTE(review): these importances are attributed to principal components that
# were relabeled with raw-feature names in an earlier cell; they do not measure
# the raw features directly.
importances = best_model.feature_importances_  # one value per column of X
feature_importances = pd.DataFrame({'feature': selected_cols, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[399]:
feature importance
1 diabetes_prevalence 0.403544
5 aged_65_older 0.288725
0 cardiovasc_death_rate 0.277344
2 female_smokers 0.021881
6 median_age 0.006558
3 male_smokers 0.001584
4 life_expectancy 0.000364
In [400]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# pathlib.Path(DATA_DIR) so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[400]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [401]:
country1 = 'Ireland'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): this chained selection may return a copy; the in-place column
# assignments in later cells can raise SettingWithCopyWarning — consider
# appending .copy() to the final selection.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [402]:
df_updated
Out[402]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
18838 Ireland 2/29/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
18839 Ireland 3/1/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
18840 Ireland 3/2/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
18841 Ireland 3/3/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
18842 Ireland 3/4/2020 2.96 0.955 67335.293 69.874 5023108 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 25063.846 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 25063.846 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 25063.846 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 25063.846 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 25063.846 31.212 1850654 0.631969

2073 rows × 8 columns

In [403]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes rows are date-ordered within each location — confirm,
# or add an explicit sort_values(['location', 'date']) first to be safe.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [404]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling the lag warm-up rows with 0 fabricates "zero mortality"
# observations at the start of each series; dropping those rows is usually safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [405]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] still contains 'Mortality Rate' AND the three
# lagged-mortality columns, so the components are fit on (and mix in) the target —
# target leakage into X. PCA is also fit on ALL rows before the train/test split
# (test-set leakage) and on unscaled data, so large-magnitude columns such as
# population dominate the components. Fit PCA on the scaled predictor columns of
# the training split only (e.g. via a Pipeline).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[405]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [406]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [407]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): the first 5 PCs are relabelled with the original column names —
# misleading, since each PC is a linear mix of all input columns (including the
# target/lag columns fed to PCA above). Names like 'PC1'..'PC5' would be honest.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [408]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns produced here are never used — X below is built
# from principal_df only; this cell could be dropped.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [409]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random shuffle split ignores the temporal order of this time
# series; combined with lagged-target information inside the PCs, near-perfect
# test scores are expected. Consider a time-based (or per-country forward) split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [410]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[410]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [411]:
# Apply scaling on the training set
# (scaler was fit on X_train only — correct: no test statistics leak in at this step)
X_train_scaled = scaler.transform(X_train)
In [412]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [413]:
# Define XGBoost model
# NOTE(review): no random_state set — runs are not guaranteed reproducible.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 324 combinations x 10 folds = 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [414]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979185512625015
In [415]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV(refit=True by default) already refit
# best_estimator_ on the full training set.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [416]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy computes KL divergence of the two vectors
# normalised to sum to 1 — not a regression metric, and undefined where y_pred
# is 0 but y_test is not. Confirm this is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006417548514505037
R2 Score: 0.9970884609748346
RMSE: 0.080110
Entropy Value: 0.0010836828796983616
In [417]:
# NOTE(review): X was built from principal components, so each "feature" label
# below names a PC by position, not the raw column it is labelled with.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[417]:
feature importance
1 human_development_index 0.727506
0 hospital_beds_per_thousand 0.137840
2 gdp_per_capita 0.055472
3 population_density 0.045744
4 population 0.033436
In [418]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable Path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[418]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [419]:
country1 = 'Luxembourg'
country2 = 'Portugal'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): chained selection may return a copy; later in-place column
# assignments can raise SettingWithCopyWarning — consider .copy().
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [420]:
df_updated
Out[420]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17796 Luxembourg 12/26/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17797 Luxembourg 12/27/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17798 Luxembourg 12/28/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872
17799 Luxembourg 12/29/2022 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.377872

2075 rows × 10 columns

In [421]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes date-ordered rows within each location — confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [422]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling warm-up lags with 0 fabricates zero-mortality observations;
# dropping those rows is usually safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [423]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] still includes 'Mortality Rate' and the three lag columns,
# so the PCs mix in the target (target leakage). PCA is also fit on all rows before
# the train/test split and on unscaled data — fit it on scaled training predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[423]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [424]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [425]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): relabelling PCs with the original column names is misleading —
# each PC is a linear mix of all inputs (including the target/lag columns above).
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [426]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): these dummy columns are never used — X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [427]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series plus lagged-target PCs in X makes
# near-perfect scores expected — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [428]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[428]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [429]:
# Apply scaling on the training set
# (scaler was fit on X_train only — no test statistics leak in at this step)
X_train_scaled = scaler.transform(X_train)
In [430]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [431]:
# Define XGBoost model
# NOTE(review): no random_state set — runs are not guaranteed reproducible.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 324 combinations x 10 folds = 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [432]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9977076018554133
In [433]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV already refit best_estimator_ (refit=True).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [434]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is KL divergence over normalised vectors —
# not a regression metric; undefined where y_pred is 0 but y_test is not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.001019000062480644
R2 Score: 0.9987690062547512
RMSE: 0.031922
Entropy Value: 0.0002762300078945429
In [435]:
# NOTE(review): X was built from principal components, so each "feature" label
# below names a PC by position, not the raw column it is labelled with.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[435]:
feature importance
1 diabetes_prevalence 0.832116
0 cardiovasc_death_rate 0.121544
5 aged_65_older 0.015343
2 female_smokers 0.015126
6 median_age 0.012136
3 male_smokers 0.002974
4 life_expectancy 0.000761
In [436]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable Path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[436]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [437]:
country1 = 'Luxembourg'
country2 = 'Portugal'

# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): chained selection may return a copy; later in-place column
# assignments can raise SettingWithCopyWarning — consider .copy().
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [438]:
df_updated
Out[438]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
10484 Portugal 3/1/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.39 0.864 27936.896 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ...
17795 Luxembourg 12/25/2022 4.51 0.916 94277.965 231.447 647601 0.377872
17796 Luxembourg 12/26/2022 4.51 0.916 94277.965 231.447 647601 0.377872
17797 Luxembourg 12/27/2022 4.51 0.916 94277.965 231.447 647601 0.377872
17798 Luxembourg 12/28/2022 4.51 0.916 94277.965 231.447 647601 0.377872
17799 Luxembourg 12/29/2022 4.51 0.916 94277.965 231.447 647601 0.377872

2075 rows × 8 columns

In [439]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes date-ordered rows within each location — confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [440]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling warm-up lags with 0 fabricates zero-mortality observations;
# dropping those rows is usually safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [441]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] still includes 'Mortality Rate' and the three lag columns,
# so the PCs mix in the target (target leakage). PCA is also fit on all rows before
# the train/test split and on unscaled data — fit it on scaled training predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[441]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [442]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [443]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): relabelling PCs with the original column names is misleading —
# each PC is a linear mix of all inputs (including the target/lag columns above).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [444]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): these dummy columns are never used — X below comes from principal_df.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [445]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series plus lagged-target PCs in X makes
# near-perfect scores expected — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [446]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[446]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [447]:
# Apply scaling on the training set
# (scaler was fit on X_train only — no test statistics leak in at this step)
X_train_scaled = scaler.transform(X_train)
In [448]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [449]:
# Define XGBoost model
# NOTE(review): no random_state set — runs are not guaranteed reproducible.
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 324 combinations x 10 folds = 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [450]:
# Perform grid search and 10-fold cross-validation (k = 10)
# Default scoring for a regressor is R^2.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9962570390993097
In [451]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV already refit best_estimator_ (refit=True).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [452]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy is KL divergence over normalised vectors —
# not a regression metric; undefined where y_pred is 0 but y_test is not.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.002469659339838852
R2 Score: 0.9970165505261736
RMSE: 0.049696
Entropy Value: 0.0009983817597009118
In [453]:
# NOTE(review): X was built from principal components, so each "feature" label
# below names a PC by position, not the raw column it is labelled with.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[453]:
feature importance
1 human_development_index 0.645148
0 hospital_beds_per_thousand 0.185715
2 gdp_per_capita 0.102804
3 population_density 0.059981
4 population 0.006353
In [454]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable Path.
# This load + filter + lag + PCA + grid-search pipeline is copy-pasted for every
# country pair; consider extracting a function parameterised by (country1, country2,
# feature_cols) and calling it in a loop.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[454]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [455]:
country1 = 'Slovakia'
country2 = 'Sweden'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): chained selection may return a copy; later in-place column
# assignments can raise SettingWithCopyWarning — consider .copy().
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [456]:
df_updated
Out[456]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2092 rows × 10 columns

In [457]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() assumes date-ordered rows within each location — confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [458]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# NOTE(review): filling warm-up lags with 0 fabricates zero-mortality observations;
# dropping those rows is usually safer.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [459]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:,2:] still includes 'Mortality Rate' and the three lag columns,
# so the PCs mix in the target (target leakage). PCA is also fit on all rows before
# the train/test split and on unscaled data — fit it on scaled training predictors only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[459]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [460]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Project all rows and keep only the first n_components columns
# (components are ordered by explained variance).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [461]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): these columns are principal components — linear mixtures
# of ALL input columns — not the original variables. Labelling them with the
# original feature names is misleading and makes the later feature-importance
# table easy to misread; PC1..PC7 would be clearer names.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [462]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only 'Mortality Rate' is read from df_updated after this
# point (X comes from principal_df), so the dummy columns are never used.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [463]:
# X: the first 7 principal components (labels reuse original feature names —
# see warning in the principal_df cell); y: the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes future and past rows
# across train/test, which inflates scores; a chronological split (or
# TimeSeriesSplit) would be more realistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [464]:
# Fit scaling on the training set
# The standardizer is fit on training data only; the same transform is
# applied to the test set in the following cells (no test leakage here).
scaler = StandardScaler()
scaler.fit(X_train)
Out[464]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [465]:
# Apply scaling on the training set
# Standardize training features using the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
In [466]:
# Apply scaling on the test set
# Reuse the train-fitted scaler so test data is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [467]:
# Base XGBoost regressor; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [468]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 324 parameter combinations x 10 folds = 3240 fits; default
# scoring for a regressor is R^2. The default KFold ignores time ordering.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981093768435327
In [469]:
# GridSearchCV (refit=True by default) has already retrained the best
# estimator on the full training set, so the explicit fit() call that was
# here was redundant and has been removed — best_estimator_ is ready to use.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [470]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# WARNING(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and computes their KL divergence — it is not a
# regression error metric, and it returns inf whenever a prediction is <= 0
# where y_test > 0 (as seen in a later run of this notebook).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013333842213367515
R2 Score: 0.9971199527942521
RMSE: 0.115472
Entropy Value: 0.001124812762035393
In [471]:
# WARNING(review): the model was trained on principal components, so these
# "importances" belong to PCs, not to the original variables whose names
# they carry — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[471]:
feature importance
1 diabetes_prevalence 0.652631
0 cardiovasc_death_rate 0.279490
6 median_age 0.042922
5 aged_65_older 0.017388
2 female_smokers 0.004467
3 male_smokers 0.002397
4 life_expectancy 0.000704
In [472]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — a configurable Path /
# DATA_DIR constant would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[472]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [473]:
country1 = 'Slovakia'
country2 = 'Sweden'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lag
# columns assigned in later cells cannot trigger SettingWithCopyWarning or
# silently write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [474]:
# Preview the filtered two-country frame (rich display via last expression).
df_updated
Out[474]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.82 0.860 30155.152 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 46949.283 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 46949.283 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 46949.283 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 46949.283 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 46949.283 24.718 10549349 0.816005

2092 rows × 8 columns

In [475]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country.
# NOTE(review): shift() assumes rows are date-sorted within each location.
for _lag, _col in [(1, 'prev_day_mortality'),
                   (7, 'prev_week_mortality'),
                   (30, 'prev_month_mortality')]:
    df_updated[_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(_lag)
In [476]:
# The earliest rows of each country's series have no lag history; fill with 0.
_lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[_lag_cols] = df_updated[_lag_cols].fillna(0)
In [477]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on raw, unscaled columns (population dominates)
# and on the FULL dataset before the train/test split — a leakage concern,
# flagged rather than changed to keep recorded outputs reproducible.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[477]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [478]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Project all rows and keep the first n_components variance-ordered columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [479]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): these columns are principal components (linear mixtures
# of ALL input columns), not the original variables — labelling them with
# the original feature names is misleading; PC1..PC5 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [480]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only 'Mortality Rate' is read from df_updated after this
# point (X comes from principal_df), so the dummy columns are never used.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [481]:
# X: the first 5 principal components (labels reuse original feature names —
# see warning in the principal_df cell); y: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes future and past rows
# across train/test, inflating scores; a chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [482]:
# Fit scaling on the training set
# Standardizer fit on training data only; applied to test data later.
scaler = StandardScaler()
scaler.fit(X_train)
Out[482]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [483]:
# Apply scaling on the training set
# Standardize training features using the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
In [484]:
# Apply scaling on the test set
# Reuse the train-fitted scaler so test data is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [485]:
# Base XGBoost regressor; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [486]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 324 combinations x 10 folds = 3240 fits; default scoring is
# R^2 for a regressor, and the default KFold ignores time ordering.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9966554266113448
In [487]:
# GridSearchCV (refit=True by default) has already retrained the best
# estimator on the full training set, so the explicit fit() call that was
# here was redundant and has been removed — best_estimator_ is ready to use.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [488]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# WARNING(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and computes their KL divergence — not a
# regression error metric; it returns inf if any prediction is <= 0 where
# y_test > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.02913709652628138
R2 Score: 0.9937065241892549
RMSE: 0.170696
Entropy Value: 0.003420938019185575
In [489]:
# WARNING(review): the model was trained on principal components, so these
# "importances" belong to PCs, not to the original variables whose names
# they carry — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[489]:
feature importance
1 human_development_index 0.583039
2 gdp_per_capita 0.348568
0 hospital_beds_per_thousand 0.037253
4 population 0.018961
3 population_density 0.012179
In [490]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — a configurable Path /
# DATA_DIR constant would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[490]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [491]:
country1 = 'United Kingdom'
country2 = 'Bulgaria'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lag
# columns assigned in later cells cannot trigger SettingWithCopyWarning or
# silently write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [492]:
# Preview the filtered two-country frame (rich display via last expression).
df_updated
Out[492]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13606 United Kingdom 12/26/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13607 United Kingdom 12/27/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13608 United Kingdom 12/28/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13609 United Kingdom 12/29/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564

2090 rows × 10 columns

In [493]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country.
# NOTE(review): shift() assumes rows are date-sorted within each location.
for _lag, _col in [(1, 'prev_day_mortality'),
                   (7, 'prev_week_mortality'),
                   (30, 'prev_month_mortality')]:
    df_updated[_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(_lag)
In [494]:
# The earliest rows of each country's series have no lag history; fill with 0.
_lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[_lag_cols] = df_updated[_lag_cols].fillna(0)
In [495]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on raw, unscaled columns and on the FULL dataset
# before the train/test split — a leakage concern, flagged rather than
# changed to keep recorded outputs reproducible.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[495]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [496]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Project all rows and keep the first n_components variance-ordered columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [497]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): these columns are principal components (linear mixtures
# of ALL input columns), not the original variables — labelling them with
# the original feature names is misleading; PC1..PC7 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [498]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only 'Mortality Rate' is read from df_updated after this
# point (X comes from principal_df), so the dummy columns are never used.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [499]:
# X: the first 7 principal components (labels reuse original feature names —
# see warning in the principal_df cell); y: the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes future and past rows
# across train/test, inflating scores; a chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [500]:
# Fit scaling on the training set
# Standardizer fit on training data only; applied to test data later.
scaler = StandardScaler()
scaler.fit(X_train)
Out[500]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [501]:
# Apply scaling on the training set
# Standardize training features using the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
In [502]:
# Apply scaling on the test set
# Reuse the train-fitted scaler so test data is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [503]:
# Base XGBoost regressor; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [504]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 324 combinations x 10 folds = 3240 fits; default scoring is
# R^2 for a regressor, and the default KFold ignores time ordering.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9362768989166241
In [505]:
# GridSearchCV (refit=True by default) has already retrained the best
# estimator on the full training set, so the explicit fit() call that was
# here was redundant and has been removed — best_estimator_ is ready to use.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [506]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# WARNING(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and computes their KL divergence — not a
# regression error metric. The "inf" printed below arises exactly when some
# prediction is <= 0 at a point where y_test > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.9616768347996019
R2 Score: 0.9502585480128497
RMSE: 0.980651
Entropy Value: inf
In [507]:
# WARNING(review): the model was trained on principal components, so these
# "importances" belong to PCs, not to the original variables whose names
# they carry — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[507]:
feature importance
1 diabetes_prevalence 0.468417
5 aged_65_older 0.317984
4 life_expectancy 0.071588
6 median_age 0.051958
2 female_smokers 0.041580
0 cardiovasc_death_rate 0.037815
3 male_smokers 0.010659
In [508]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — a configurable Path /
# DATA_DIR constant would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[508]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [509]:
country1 = 'United Kingdom'
country2 = 'Bulgaria'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lag
# columns assigned in later cells cannot trigger SettingWithCopyWarning or
# silently write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [510]:
# Preview the filtered two-country frame (rich display via last expression).
df_updated
Out[510]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 18563.307 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 18563.307 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 18563.307 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 18563.307 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 18563.307 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 2.540 0.932 39753.244 272.898 67508936 0.883564
13606 United Kingdom 12/26/2022 2.540 0.932 39753.244 272.898 67508936 0.883564
13607 United Kingdom 12/27/2022 2.540 0.932 39753.244 272.898 67508936 0.883564
13608 United Kingdom 12/28/2022 2.540 0.932 39753.244 272.898 67508936 0.883564
13609 United Kingdom 12/29/2022 2.540 0.932 39753.244 272.898 67508936 0.883564

2090 rows × 8 columns

In [511]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Build lagged mortality features (previous day / week / month) per country.
# NOTE(review): shift() assumes rows are date-sorted within each location.
for _lag, _col in [(1, 'prev_day_mortality'),
                   (7, 'prev_week_mortality'),
                   (30, 'prev_month_mortality')]:
    df_updated[_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(_lag)
In [512]:
# The earliest rows of each country's series have no lag history; fill with 0.
_lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[_lag_cols] = df_updated[_lag_cols].fillna(0)
In [513]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on raw, unscaled columns and on the FULL dataset
# before the train/test split — a leakage concern, flagged rather than
# changed to keep recorded outputs reproducible.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[513]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [514]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Project all rows and keep the first n_components variance-ordered columns.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [515]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# WARNING(review): these columns are principal components (linear mixtures
# of ALL input columns), not the original variables — labelling them with
# the original feature names is misleading; PC1..PC5 would be clearer.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [516]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only 'Mortality Rate' is read from df_updated after this
# point (X comes from principal_df), so the dummy columns are never used.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [517]:
# X: the first 5 principal components (labels reuse original feature names —
# see warning in the principal_df cell); y: the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a random split of a time series mixes future and past rows
# across train/test, inflating scores; a chronological split would be safer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [518]:
# Fit scaling on the training set
# Standardizer fit on training data only; applied to test data later.
scaler = StandardScaler()
scaler.fit(X_train)
Out[518]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [519]:
# Apply scaling on the training set
# Standardize training features using the train-fitted scaler.
X_train_scaled = scaler.transform(X_train)
In [520]:
# Apply scaling on the test set
# Reuse the train-fitted scaler so test data is scaled consistently.
X_test_scaled = scaler.transform(X_test)
In [521]:
# Base XGBoost regressor; hyperparameters are tuned by the grid search below.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid: 3*3*3*3*2*2 = 324 combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [522]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): 324 combinations x 10 folds = 3240 fits; default scoring is
# R^2 for a regressor, and the default KFold ignores time ordering.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9336897557962663
In [523]:
# GridSearchCV (refit=True by default) has already retrained the best
# estimator on the full training set, so the explicit fit() call that was
# here was redundant and has been removed — best_estimator_ is ready to use.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [524]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# WARNING(review): scipy.stats.entropy normalizes both arguments into
# probability distributions and computes their KL divergence — not a
# regression error metric; it returns inf if any prediction is <= 0 where
# y_test > 0.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.46495768148618166
R2 Score: 0.9759506838962997
RMSE: 0.681878
Entropy Value: 0.006551530873100116
In [525]:
# WARNING(review): the model was trained on principal components, so these
# "importances" belong to PCs, not to the original variables whose names
# they carry — interpret with caution.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[525]:
feature importance
1 human_development_index 0.608164
4 population 0.227605
0 hospital_beds_per_thousand 0.068975
2 gdp_per_capita 0.063380
3 population_density 0.031876
In [526]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — a configurable Path /
# DATA_DIR constant would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[526]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [527]:
country1 = 'Italy'
country2 = 'Romania'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered frame independent of the original, so the lag
# columns assigned in later cells cannot trigger SettingWithCopyWarning or
# silently write into a view.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [528]:
df_updated
Out[528]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
17800 Romania 2/26/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17801 Romania 2/27/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17802 Romania 2/28/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17803 Romania 2/29/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
17804 Romania 3/1/2020 370.946 9.74 22.9 37.1 76.05 17.850 43.0 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2102 rows × 10 columns

In [529]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively.
# shift(k) within each location group takes the Mortality Rate from k rows earlier;
# this assumes rows are already in chronological order within each country —
# the displayed frame looks date-sorted per location, but TODO confirm (no explicit sort here).
# NOTE(review): df_updated was produced by boolean-mask filtering in an earlier cell;
# assigning new columns to it may raise SettingWithCopyWarning unless that filter
# result was taken with .copy().
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [530]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# The NaNs come from shift(): the first 1/7/30 rows of each country have no
# earlier observation. One vectorized fillna over the three columns replaces
# the three separate per-column assignments (same result, one statement).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [531]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] takes every column after 'location' and 'date' —
# that includes the target 'Mortality Rate' AND the three lagged-mortality
# columns added above, so the principal components are partly built from the
# target (target leakage into the features X used later). Consider fitting
# PCA on the predictor columns only.
# NOTE(review): PCA is fitted on unscaled data — columns with the largest raw
# variance (e.g. cardiovasc_death_rate) dominate the components; StandardScaler
# is only applied later, after the split. Consider scaling before PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[531]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [532]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): 7 matches the number of health-index input variables, but the
# PCA above was fitted on more than 7 columns (it also saw 'Mortality Rate'
# and the lag columns), so these 7 components are NOT simply a rotation of
# those 7 inputs.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [533]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components PC1..PC7, not the
# original features — reusing the original feature names is misleading, and
# the later "feature importance" table therefore ranks components, not the
# raw features. Consider names like 'PC1'..'PC7'.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# .values strips the (filtered, non-contiguous) index so assignment is positional.
principal_df['location'] = df_updated['location'].values
In [534]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns appear unused below —
# X is taken from principal_df and y from 'Mortality Rate' — so this step
# seems to have no effect on the model; verify before removing.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [535]:
# Model inputs: the 7 principal components (labelled with the original feature names).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# Alignment between X (from principal_df) and y (from df_updated) is purely
# positional; it holds here because both frames derive from the same rows in
# the same order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split on a time series mixes past and future
# observations between train and test; combined with the target-derived PCA
# features this largely explains the near-perfect R^2 reported below.
# Consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [536]:
# Fit scaling on the training set
# (statistics are learned from X_train only, so no test-set information
# leaks into the scaling step itself).
scaler = StandardScaler()
scaler.fit(X_train)
Out[536]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [537]:
# Apply scaling on the training set
# (uses the mean/std fitted from X_train in the previous cell)
X_train_scaled = scaler.transform(X_train)
In [538]:
# Apply scaling on the test set
# (same training-set statistics applied to the held-out data)
X_test_scaled = scaler.transform(X_test)
In [539]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Hyperparameter search space: 3*3*3*3*2*2 = 324 candidate combinations.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [540]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 combinations x 10 folds; n_jobs=-1 parallelizes over all cores.
# NOTE(review): default KFold ignores the temporal ordering of the rows (same
# caveat as the shuffled train/test split). With refit=True (the default) the
# best estimator is refit on the full training set after the search.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9991769240906299
In [541]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV with refit=True (default) has already refit
# best_estimator_ on the full training set, so this fit() call is redundant —
# harmless, but it repeats the work.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [542]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arguments into
# probability distributions and returns their KL divergence — it is not a
# standard regression metric, and it returns inf whenever a prediction is
# <= 0 where the true value is > 0. Interpret with care (or drop).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005430612503529626
R2 Score: 0.999481950529423
RMSE: 0.073693
Entropy Value: 0.00018327219830469819
In [543]:
# Rank model inputs by XGBoost's built-in importance scores.
# NOTE(review): because X holds principal components that were merely
# labelled with the original feature names, this table ranks components,
# not the raw features.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[543]:
feature importance
1 diabetes_prevalence 0.498915
0 cardiovasc_death_rate 0.238504
5 aged_65_older 0.181379
6 median_age 0.065676
2 female_smokers 0.012879
3 male_smokers 0.002292
4 life_expectancy 0.000355
In [544]:
# Importing the dataframe of all 26 countries
# (fresh reload because the previous pipeline overwrote df_updated in place)
# NOTE(review): hardcoded absolute Windows path — see earlier note; prefer a
# configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[544]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [545]:
# Country pair under analysis.
country1 = 'Italy'
country2 = 'Romania'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() detaches the filtered result from the parent frame so that the
# lagged-mortality column assignments in later cells do not trigger pandas'
# SettingWithCopyWarning (chained-assignment hazard).
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[country_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [546]:
df_updated
Out[546]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
17800 Romania 2/26/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
17801 Romania 2/27/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
17802 Romania 2/28/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
17803 Romania 2/29/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
17804 Romania 3/1/2020 6.892 0.828 23313.199 85.129 19659270 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.180 0.892 35220.084 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.180 0.892 35220.084 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.180 0.892 35220.084 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.180 0.892 35220.084 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.180 0.892 35220.084 205.859 59037472 0.735109

2102 rows × 8 columns

In [547]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [548]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# The NaNs are the first 1/7/30 rows of each country, which have no earlier
# observation to shift from. One vectorized fillna replaces the three
# per-column assignments (identical result).
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [549]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[549]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [550]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [551]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [552]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [553]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [554]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[554]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [555]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [556]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [557]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [558]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9973047338698764
In [559]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [560]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both inputs into
# probability distributions and returns their KL divergence — not a regression
# metric, and it yields inf if any prediction is <= 0 where the true value is
# > 0. Interpret with care (or drop).
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.025677028648985536
R2 Score: 0.9975505578626812
RMSE: 0.160241
Entropy Value: 0.0009073480150924102
In [561]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[561]:
feature importance
1 human_development_index 0.613296
0 hospital_beds_per_thousand 0.157358
3 population_density 0.143481
2 gdp_per_capita 0.078615
4 population 0.007250
In [562]:
# Importing the dataframe of all 26 countries
# (fresh reload because the previous pipeline overwrote df_updated in place)
# NOTE(review): hardcoded absolute Windows path — see earlier note.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[562]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [563]:
# Country pair under analysis.
country1 = 'Spain'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the population health index.
# .copy() detaches the filtered result from the parent frame so the later
# lagged-column assignments do not trigger SettingWithCopyWarning.
population_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence',
                   'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older',
                   'median_age', 'Mortality Rate']
df_updated = df_updated[population_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [564]:
df_updated
Out[564]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
24074 Spain 2/1/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24075 Spain 2/2/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24076 Spain 2/3/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24077 Spain 2/4/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24078 Spain 2/5/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2136 rows × 10 columns

In [565]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [566]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# The NaNs are the leading 1/7/30 rows of each country produced by shift().
# One vectorized fillna over the three columns, same result as the three
# separate assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [567]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[567]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [568]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [569]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [570]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [571]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [572]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[572]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [573]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [574]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [575]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [576]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9982647069016395
In [577]:
# Fit the model using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [578]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) is the KL divergence of the two
# normalized distributions, not a regression metric; it returns inf if any
# prediction is <= 0 where the true value is > 0. Interpret with care.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.06052287316456901
R2 Score: 0.9898126721885138
RMSE: 0.246014
Entropy Value: 0.0021185863451388767
In [579]:
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[579]:
feature importance
1 diabetes_prevalence 0.803399
0 cardiovasc_death_rate 0.129064
5 aged_65_older 0.034493
2 female_smokers 0.014970
3 male_smokers 0.013636
6 median_age 0.004311
4 life_expectancy 0.000126
In [580]:
# Importing the dataframe of all 26 countries
# (fresh reload because the previous pipeline overwrote df_updated in place)
# NOTE(review): hardcoded absolute Windows path — see earlier note.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[580]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [581]:
# Country pair under analysis.
country1 = 'Spain'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the country health index.
# .copy() detaches the filtered result from the parent frame so the later
# lagged-column assignments do not trigger SettingWithCopyWarning.
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index',
                'gdp_per_capita', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated[country_cols]
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [582]:
df_updated
Out[582]:
location date hospital_beds_per_thousand human_development_index gdp_per_capita population_density population Mortality Rate
24074 Spain 2/1/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
24075 Spain 2/2/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
24076 Spain 2/3/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
24077 Spain 2/4/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
24078 Spain 2/5/2020 2.97 0.904 34272.360 93.105 47558632 0.000000
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 54225.446 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 54225.446 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 54225.446 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 54225.446 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 54225.446 35.608 338289856 1.084791

2136 rows × 8 columns

In [583]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [584]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0.
# The NaNs are the leading 1/7/30 rows of each country produced by shift().
# One vectorized fillna over the three columns, same result as the three
# separate assignments.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [585]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[585]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [586]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [587]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [588]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [589]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'gdp_per_capita', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [590]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[590]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [591]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [592]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [593]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [594]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9974418277786332
In [595]:
# Use the model refit with the best hyperparameters.
# With GridSearchCV's default refit=True, best_estimator_ has already been refit on the
# full (scaled) training set, so calling fit() again here only repeated the identical
# training run; the redundant refit has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [596]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both vectors to sum to 1
# and computes a KL divergence — it treats them as probability distributions, which is
# not a standard regression metric; interpret the "Entropy Value" with caution.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0702219546964368
R2 Score: 0.988180103906985
RMSE: 0.264994
Entropy Value: 0.0028392443335123986
In [597]:
# Rank model inputs by the trained booster's importance scores.
# NOTE(review): the model was trained on principal components, so these scores describe
# the components, not the original features whose names label them here.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[597]:
feature importance
1 human_development_index 0.588371
4 population 0.187305
0 hospital_beds_per_thousand 0.093021
2 gdp_per_capita 0.077402
3 population_density 0.053900
In [7]:
# Country Pair by Pair Analysis relative to gdp_per_capita
In [9]:
# Importing the cleaned and preprocessed Our World in Data COVID-19 dataset
# NOTE(review): hardcoded absolute Windows path — prefer a relative path or a
# configurable DATA_DIR so the notebook runs on other machines.
df = pd.read_csv("C:/Users/marco/Downloads/covid-data-cleaned.csv")
df
Out[9]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population stringency_index Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 11.11 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27269 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086136 United States
27270 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.086032 United States
27271 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.085212 United States
27272 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084986 United States
27273 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 23.18 1.084791 United States

27274 rows × 18 columns

In [10]:
# Showing the pairings of countries based on gdp_per_capita (13 pairs of countries);
# each country gets its own dataframe, grouped two-per-pair below.
df_Ireland = df.loc[df["location"] == "Ireland"]
df_Luxembourg = df.loc[df["location"] == "Luxembourg"]

df_Switzerland = df.loc[df["location"] == "Switzerland"]
df_UnitedStates = df.loc[df["location"] == "United States"]

df_Austria = df.loc[df["location"] == "Austria"]
df_Belgium = df.loc[df["location"] == "Belgium"]

df_Canada = df.loc[df["location"] == "Canada"]
df_Denmark = df.loc[df["location"] == "Denmark"]

df_Finland = df.loc[df["location"] == "Finland"]
df_France = df.loc[df["location"] == "France"]

df_Iceland = df.loc[df["location"] == "Iceland"]
df_Italy = df.loc[df["location"] == "Italy"]

df_Netherlands = df.loc[df["location"] == "Netherlands"]
df_Sweden = df.loc[df["location"] == "Sweden"]

df_UnitedKingdom = df.loc[df["location"] == "United Kingdom"]
df_Bulgaria = df.loc[df["location"] == "Bulgaria"]

df_Cyprus = df.loc[df["location"] == "Cyprus"]
df_Czechia = df.loc[df["location"] == "Czechia"]

df_Estonia = df.loc[df["location"] == "Estonia"]
df_Latvia = df.loc[df["location"] == "Latvia"]

df_Portugal = df.loc[df["location"] == "Portugal"]
df_Romania = df.loc[df["location"] == "Romania"]

df_Serbia = df.loc[df["location"] == "Serbia"]
df_Slovakia = df.loc[df["location"] == "Slovakia"]

df_Slovenia = df.loc[df["location"] == "Slovenia"]
df_Spain = df.loc[df["location"] == "Spain"]
In [11]:
df_UnitedKingdom_new = df_UnitedKingdom.tail(-2)
In [12]:
# Concatenate the individual country dataframes into a single dataframe.
# NOTE(review): despite the original description ("first country from each pair"),
# this list contains all 26 countries — both members of every pair.
dataframes = [df_Austria, df_Belgium, df_Bulgaria, df_Cyprus, df_Czechia, df_Denmark, df_Estonia, df_Finland, df_France, df_Netherlands, df_Portugal, df_Slovakia, df_UnitedKingdom_new, df_Switzerland, df_Canada, df_Serbia, df_Luxembourg, df_Romania, df_Ireland, df_Latvia, df_Iceland, df_Italy, df_Sweden, df_Spain, df_Slovenia, df_UnitedStates]
dataframe_one = pd.concat(dataframes)

# Exporting final_dataframe to CSV file
# NOTE(review): written to the current working directory but re-imported below from the
# Downloads folder — confirm these resolve to the same file. The default index=True
# also writes the row index as an extra unnamed column.
dataframe_one.to_csv("dataframe-one.csv")
In [13]:
# Importing the combined dataframe of all 26 countries written in the previous step
# NOTE(review): hardcoded absolute Windows path; also, since the CSV was written with
# the default index=True, pandas may add an 'Unnamed: 0' column on re-import — verify.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[13]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [14]:
# First gdp_per_capita pair for the population-health-index model.
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two paired countries for this run of the pipeline.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [15]:
df_updated
Out[15]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
16759 Luxembourg 2/12/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16760 Luxembourg 2/24/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16761 Luxembourg 2/25/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16762 Luxembourg 2/26/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
16763 Luxembourg 2/27/2020 128.275 4.42 20.9 26.0 82.25 14.312 39.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19869 Ireland 12/26/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19870 Ireland 12/27/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19871 Ireland 12/28/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388
19872 Ireland 12/29/2022 126.459 3.28 23.0 25.7 82.30 13.928 38.7 0.491388

2076 rows × 10 columns

In [16]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() moves by row position, not calendar days — the data shows date
# gaps (e.g. Luxembourg jumps 2/12 -> 2/24), so shift(7)/shift(30) only approximate
# "previous week/month".
# NOTE(review): df_updated is a filtered slice; assigning new columns to it can raise
# SettingWithCopyWarning — a .copy() after filtering would make the intent explicit.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [17]:
# Replace the NaNs produced by the lag shifts with 0 — the rows at the start of each
# country's series have no earlier observation to lag from.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [18]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which includes
# 'Mortality Rate' itself plus the lagged-mortality columns — the prediction target
# leaks into the components used as model inputs, likely inflating the reported scores.
# NOTE(review): PCA here runs on unscaled data (standardization happens later), so
# high-variance columns dominate the components; the usual order is scale -> PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[18]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [19]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA was fit on more columns than 7 (features + target + lag
# columns); keeping the first 7 components is a projection of all of those inputs,
# not a selection of the 7 named features.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [20]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of every
# PCA input), not the original features — reusing the raw feature names is misleading
# and propagates into the "feature importance" table downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Re-attach country labels; relies on principal_components preserving df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [21]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are never used below (X is built
# from principal_df), so this step effectively only removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [22]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): X holds principal components merely labeled with these feature names;
# rows align with y only because principal_df copied df_updated's row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a purely random split of daily time-series rows places near-identical
# adjacent days in both train and test sets, which inflates test metrics; consider a
# chronological split for a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [23]:
# Fit scaling on the training set
# Standardization statistics (mean/std) are learned from X_train only, so the test
# split does not influence the scaler — correct practice.
scaler = StandardScaler()
scaler.fit(X_train)
Out[23]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [24]:
# Apply scaling on the training set
# (uses the mean/std learned when the scaler was fit on X_train)
X_train_scaled = scaler.transform(X_train)
In [25]:
# Apply scaling on the test set
# (reuses training-set statistics only — no test-set leakage in this step)
X_test_scaled = scaler.transform(X_test)
In [26]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [27]:
# Perform grid search and 10-fold cross-validation (k = 10)
# scoring is left at the default, i.e. the estimator's own .score (R^2 for regressors),
# so best_score_ below is a cross-validated R^2 on the training folds.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9987521060977691
In [28]:
# Use the model refit with the best hyperparameters.
# With GridSearchCV's default refit=True, best_estimator_ has already been refit on the
# full (scaled) training set, so calling fit() again here only repeated the identical
# training run; the redundant refit has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [29]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both vectors to sum to 1
# and computes a KL divergence — it treats them as probability distributions, which is
# not a standard regression metric; interpret the "Entropy Value" with caution.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0015514128412379164
R2 Score: 0.9993205997164117
RMSE: 0.039388
Entropy Value: 0.0003261513916954156
In [30]:
# Rank model inputs by the trained booster's importance scores.
# NOTE(review): the model was trained on principal components, so these scores describe
# the components, not the original features whose names label them here.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[30]:
feature importance
6 median_age 0.681071
5 aged_65_older 0.213986
0 cardiovasc_death_rate 0.084737
1 diabetes_prevalence 0.013333
2 female_smokers 0.006390
3 male_smokers 0.000304
4 life_expectancy 0.000178
In [31]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path; also, since the CSV was written with
# the default index=True, pandas may add an 'Unnamed: 0' column on re-import — verify.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[31]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [32]:
# Same gdp_per_capita pair, now for the country-health-index model.
country1 = 'Ireland'
country2 = 'Luxembourg'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# Keep only the two paired countries for this run of the pipeline.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [33]:
df_updated
Out[33]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
16759 Luxembourg 2/12/2020 4.51 0.916 0.2 231.447 647601 0.000000
16760 Luxembourg 2/24/2020 4.51 0.916 0.2 231.447 647601 0.000000
16761 Luxembourg 2/25/2020 4.51 0.916 0.2 231.447 647601 0.000000
16762 Luxembourg 2/26/2020 4.51 0.916 0.2 231.447 647601 0.000000
16763 Luxembourg 2/27/2020 4.51 0.916 0.2 231.447 647601 0.000000
... ... ... ... ... ... ... ... ...
19868 Ireland 12/25/2022 2.96 0.955 0.2 69.874 5023108 0.491388
19869 Ireland 12/26/2022 2.96 0.955 0.2 69.874 5023108 0.491388
19870 Ireland 12/27/2022 2.96 0.955 0.2 69.874 5023108 0.491388
19871 Ireland 12/28/2022 2.96 0.955 0.2 69.874 5023108 0.491388
19872 Ireland 12/29/2022 2.96 0.955 0.2 69.874 5023108 0.491388

2076 rows × 8 columns

In [34]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() moves by row position, not calendar days — if a country's
# series has date gaps, shift(7)/shift(30) only approximate "previous week/month".
# NOTE(review): df_updated is a filtered slice; assigning new columns to it can raise
# SettingWithCopyWarning — a .copy() after filtering would make the intent explicit.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [35]:
# Replace the NaNs produced by the lag shifts with 0 — the rows at the start of each
# country's series have no earlier observation to lag from.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [36]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which includes
# 'Mortality Rate' itself plus the lagged-mortality columns — the prediction target
# leaks into the components used as model inputs, likely inflating the reported scores.
# NOTE(review): PCA here runs on unscaled data (standardization happens later), so
# high-variance columns such as population dominate the components; the usual order
# is scale -> PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[36]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [37]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): the PCA was fit on more columns than 5 (features + target + lag
# columns); keeping the first 5 components is a projection of all of those inputs,
# not a selection of the 5 named features.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [38]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of every
# PCA input), not the original features — reusing the raw feature names is misleading
# and propagates into the "feature importance" table downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
# Re-attach country labels; relies on principal_components preserving df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [39]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are never used below (X is built
# from principal_df), so this step effectively only removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [40]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# NOTE(review): X holds principal components merely labeled with these feature names;
# rows align with y only because principal_df copied df_updated's row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a purely random split of daily time-series rows places near-identical
# adjacent days in both train and test sets, which inflates test metrics; consider a
# chronological split for a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [41]:
# Fit scaling on the training set
# Standardization statistics (mean/std) are learned from X_train only, so the test
# split does not influence the scaler — correct practice.
scaler = StandardScaler()
scaler.fit(X_train)
Out[41]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [42]:
# Apply scaling on the training set
# (uses the mean/std learned when the scaler was fit on X_train)
X_train_scaled = scaler.transform(X_train)
In [43]:
# Apply scaling on the test set
# (reuses training-set statistics only — no test-set leakage in this step)
X_test_scaled = scaler.transform(X_test)
In [44]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [45]:
# Perform grid search and 10-fold cross-validation (k = 10)
# scoring is left at the default, i.e. the estimator's own .score (R^2 for regressors),
# so best_score_ below is a cross-validated R^2 on the training folds.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9951141561453272
In [46]:
# Use the model refit with the best hyperparameters.
# With GridSearchCV's default refit=True, best_estimator_ has already been refit on the
# full (scaled) training set, so calling fit() again here only repeated the identical
# training run; the redundant refit has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [47]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both vectors to sum to 1
# and computes a KL divergence — it treats them as probability distributions, which is
# not a standard regression metric; interpret the "Entropy Value" with caution.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.00468197321349643
R2 Score: 0.9979496534742719
RMSE: 0.068425
Entropy Value: 0.0012251208567780243
In [48]:
# Rank model inputs by the trained booster's importance scores.
# NOTE(review): the model was trained on principal components, so these scores describe
# the components, not the original features whose names label them here.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[48]:
feature importance
1 human_development_index 0.741234
0 hospital_beds_per_thousand 0.114263
2 extreme_poverty 0.070959
4 population 0.047536
3 population_density 0.026008
In [49]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path; also, since the CSV was written with
# the default index=True, pandas may add an 'Unnamed: 0' column on re-import — verify.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[49]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [50]:
# Second gdp_per_capita pair for the population-health-index model.
country1 = 'Switzerland'
country2 = 'United States'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# Keep only the two paired countries for this run of the pipeline.
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [51]:
df_updated
Out[51]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
13610 Switzerland 2/25/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13611 Switzerland 2/26/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13612 Switzerland 2/27/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13613 Switzerland 2/28/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
13614 Switzerland 2/29/2020 99.739 5.59 22.6 28.9 83.78 18.436 43.1 0.000000
... ... ... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086136
27268 United States 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.086032
27269 United States 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.085212
27270 United States 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084986
27271 United States 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 38.3 1.084791

2112 rows × 10 columns

In [52]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() moves by row position, not calendar days — if a country's
# series has date gaps, shift(7)/shift(30) only approximate "previous week/month".
# NOTE(review): df_updated is a filtered slice; assigning new columns to it can raise
# SettingWithCopyWarning — a .copy() after filtering would make the intent explicit.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [53]:
# Replace the NaNs produced by the lag shifts with 0 — the rows at the start of each
# country's series have no earlier observation to lag from.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [54]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date', which includes
# 'Mortality Rate' itself plus the lagged-mortality columns — the prediction target
# leaks into the components used as model inputs, likely inflating the reported scores.
# NOTE(review): PCA here runs on unscaled data (standardization happens later), so
# high-variance columns dominate the components; the usual order is scale -> PCA.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[54]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [55]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): the PCA was fit on more columns than 7 (features + target + lag
# columns); keeping the first 7 components is a projection of all of those inputs,
# not a selection of the 7 named features.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [56]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (linear combinations of every
# PCA input), not the original features — reusing the raw feature names is misleading
# and propagates into the "feature importance" table downstream.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
# Re-attach country labels; relies on principal_components preserving df_updated's row order.
principal_df['location'] = df_updated['location'].values
In [57]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the generated location_* dummy columns are never used below (X is built
# from principal_df), so this step effectively only removes the 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [58]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# NOTE(review): X holds principal components merely labeled with these feature names;
# rows align with y only because principal_df copied df_updated's row order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a purely random split of daily time-series rows places near-identical
# adjacent days in both train and test sets, which inflates test metrics; consider a
# chronological split for a fairer evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [59]:
# Fit scaling on the training set
# Standardization statistics (mean/std) are learned from X_train only, so the test
# split does not influence the scaler — correct practice.
scaler = StandardScaler()
scaler.fit(X_train)
Out[59]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [60]:
# Apply scaling on the training set
# (uses the mean/std learned when the scaler was fit on X_train)
X_train_scaled = scaler.transform(X_train)
In [61]:
# Apply scaling on the test set
# (reuses training-set statistics only — no test-set leakage in this step)
X_test_scaled = scaler.transform(X_test)
In [62]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# NOTE(review): 3*3*3*3*2*2 = 324 combinations; with cv=10 below this means 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [63]:
# Perform grid search and 10-fold cross-validation (k = 10)
# scoring is left at the default, i.e. the estimator's own .score (R^2 for regressors),
# so best_score_ below is a cross-validated R^2 on the training folds.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9866816704098931
In [64]:
# Use the model refit with the best hyperparameters.
# With GridSearchCV's default refit=True, best_estimator_ has already been refit on the
# full (scaled) training set, so calling fit() again here only repeated the identical
# training run; the redundant refit has been removed.
best_model = grid_search.best_estimator_

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [65]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) normalizes both vectors to sum to 1
# and computes a KL divergence — it treats them as probability distributions, which is
# not a standard regression metric; interpret the "Entropy Value" with caution.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.004320498645038054
R2 Score: 0.9979842946002436
RMSE: 0.065731
Entropy Value: 0.0005109868873380628
In [66]:
# Rank model inputs by the trained booster's importance scores.
# NOTE(review): the model was trained on principal components, so these scores describe
# the components, not the original features whose names label them here.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[66]:
feature importance
1 diabetes_prevalence 0.810118
6 median_age 0.055796
2 female_smokers 0.052323
4 life_expectancy 0.045962
0 cardiovasc_death_rate 0.014979
5 aged_65_older 0.013223
3 male_smokers 0.007598
In [67]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path; also, since the CSV was written with
# the default index=True, pandas may add an 'Unnamed: 0' column on re-import — verify.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[67]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [68]:
# Countries compared in this run of the country-health-index analysis.
country1 = 'Switzerland'
country2 = 'United States'

# Keep only the columns used by the country-health-index model, then
# restrict the rows to the two selected countries.
model_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, model_cols]
In [69]:
# Inspect the filtered two-country frame before feature engineering.
df_updated
Out[69]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
13610 Switzerland 2/25/2020 4.53 0.955 0.03 214.243 8740471 0.000000
13611 Switzerland 2/26/2020 4.53 0.955 0.03 214.243 8740471 0.000000
13612 Switzerland 2/27/2020 4.53 0.955 0.03 214.243 8740471 0.000000
13613 Switzerland 2/28/2020 4.53 0.955 0.03 214.243 8740471 0.000000
13614 Switzerland 2/29/2020 4.53 0.955 0.03 214.243 8740471 0.000000
... ... ... ... ... ... ... ... ...
27267 United States 12/25/2022 2.77 0.926 1.20 35.608 338289856 1.086136
27268 United States 12/26/2022 2.77 0.926 1.20 35.608 338289856 1.086032
27269 United States 12/27/2022 2.77 0.926 1.20 35.608 338289856 1.085212
27270 United States 12/28/2022 2.77 0.926 1.20 35.608 338289856 1.084986
27271 United States 12/29/2022 2.77 0.926 1.20 35.608 338289856 1.084791

2112 rows × 8 columns

In [70]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so lags never cross a location boundary)
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [71]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lag history; treat it as zero)
df_updated = df_updated.fillna({
    'prev_day_mortality': 0,
    'prev_week_mortality': 0,
    'prev_month_mortality': 0,
})
In [72]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled columns (large-magnitude columns such
# as population will dominate the variance) and on the full dataset before
# the train/test split (test-set information leaks into the components).
# df_updated.iloc[:,2:] also includes the lagged-mortality columns, not only
# the health-index inputs — confirm all of this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[72]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [73]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Project the data onto the PCA basis and keep only the first n_components scores.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [74]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a mixture of all
# PCA inputs), not the original feature it is named after; downstream
# "feature importances" therefore rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [75]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df); this call also removes the original 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [76]:
# Model inputs are the first five PCA scores (held in principal_df); the
# target is the daily Mortality Rate from the engineered frame.
selected_cols = [
    'hospital_beds_per_thousand',
    'human_development_index',
    'extreme_poverty',
    'population_density',
    'population',
]
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for XGBoost Model
# (30% held out for testing; fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [77]:
# Fit scaling on the training set
# (mean/std learned from the training rows only, so the test set does not
# leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[77]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [78]:
# Apply scaling on the training set
# (uses the train-set mean/std fitted above)
X_train_scaled = scaler.transform(X_train)
In [79]:
# Apply scaling on the test set
# (same train-fitted scaler — test data must never refit it)
X_test_scaled = scaler.transform(X_test)
In [80]:
# Define XGBoost model
# NOTE(review): no random_state is set; with subsample/colsample_bytree < 1
# the fit is not run-to-run reproducible — consider XGBRegressor(random_state=42).
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [81]:
# Perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the regressor's R^2; n_jobs=-1 uses all CPU cores)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9858714493338189
In [82]:
# Fit the model using the best hyperparameters
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training set, so this fit() retrains on the
# same data — confirm the explicit refit is intentional.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [83]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression error metric, and zeros in y_pred would make the value
# infinite. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.013696625921283593
R2 Score: 0.9936099128604793
RMSE: 0.117033
Entropy Value: 0.0009584504854066147
In [84]:
# Rank the model inputs by XGBoost importance score (descending).
# NOTE(review): the inputs are PCA components that were only *named* after
# the original columns, so these labels do not identify raw features.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[84]:
feature importance
1 human_development_index 0.854932
2 extreme_poverty 0.067287
4 population 0.042602
3 population_density 0.022960
0 hospital_beds_per_thousand 0.012219
In [85]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[85]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [86]:
# Countries compared in this run of the population-health-index analysis.
country1 = 'Austria'
country2 = 'Belgium'

# Keep only the columns used by the population-health-index model, then
# restrict the rows to the two selected countries.
model_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, model_cols]
In [87]:
# Inspect the filtered two-country frame before feature engineering.
df_updated
Out[87]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
0 Austria 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
1 Austria 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
2 Austria 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
3 Austria 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
4 Austria 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 44.4 0.000000
... ... ... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2095 Belgium 12/26/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2096 Belgium 12/27/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2097 Belgium 12/28/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787
2098 Belgium 12/29/2022 114.898 4.29 25.1 31.4 81.63 18.571 41.8 0.711787

2099 rows × 10 columns

In [88]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so lags never cross a location boundary)
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [89]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lag history; treat it as zero)
df_updated = df_updated.fillna({
    'prev_day_mortality': 0,
    'prev_week_mortality': 0,
    'prev_month_mortality': 0,
})
In [90]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled columns (large-magnitude columns will
# dominate the variance) and on the full dataset before the train/test split
# (test-set information leaks into the components). df_updated.iloc[:,2:]
# also includes the lagged-mortality columns — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[90]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [91]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Project the data onto the PCA basis and keep only the first n_components scores.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [92]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a mixture of all
# PCA inputs), not the original feature it is named after; downstream
# "feature importances" therefore rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [93]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df); this call also removes the original 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [94]:
# Model inputs are the first seven PCA scores (held in principal_df); the
# target is the daily Mortality Rate from the engineered frame.
selected_cols = [
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'female_smokers',
    'male_smokers',
    'life_expectancy',
    'aged_65_older',
    'median_age',
]
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for XGBoost Model
# (30% held out for testing; fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [95]:
# Fit scaling on the training set
# (mean/std learned from the training rows only, so the test set does not
# leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[95]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [96]:
# Apply scaling on the training set
# (uses the train-set mean/std fitted above)
X_train_scaled = scaler.transform(X_train)
In [97]:
# Apply scaling on the test set
# (same train-fitted scaler — test data must never refit it)
X_test_scaled = scaler.transform(X_test)
In [98]:
# Define XGBoost model
# NOTE(review): no random_state is set; with subsample/colsample_bytree < 1
# the fit is not run-to-run reproducible — consider XGBRegressor(random_state=42).
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [99]:
# Perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the regressor's R^2; n_jobs=-1 uses all CPU cores)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9985877971766527
In [100]:
# Fit the model using the best hyperparameters
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training set, so this fit() retrains on the
# same data — confirm the explicit refit is intentional.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [101]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression error metric, and zeros in y_pred would make the value
# infinite. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.005399476083504289
R2 Score: 0.9995439543541608
RMSE: 0.073481
Entropy Value: 0.0003649161068700861
In [102]:
# Rank the model inputs by XGBoost importance score (descending).
# NOTE(review): the inputs are PCA components that were only *named* after
# the original columns, so these labels do not identify raw features.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[102]:
feature importance
6 median_age 0.846396
0 cardiovasc_death_rate 0.062219
1 diabetes_prevalence 0.061911
5 aged_65_older 0.025860
2 female_smokers 0.002106
3 male_smokers 0.001366
4 life_expectancy 0.000142
In [103]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[103]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [104]:
# Countries compared in this run of the country-health-index analysis.
country1 = 'Austria'
country2 = 'Belgium'

# Keep only the columns used by the country-health-index model, then
# restrict the rows to the two selected countries.
model_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, model_cols]
In [105]:
# Inspect the filtered two-country frame before feature engineering.
df_updated
Out[105]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
0 Austria 2/25/2020 7.37 0.922 0.7 106.749 8939617 0.000000
1 Austria 2/26/2020 7.37 0.922 0.7 106.749 8939617 0.000000
2 Austria 2/27/2020 7.37 0.922 0.7 106.749 8939617 0.000000
3 Austria 2/28/2020 7.37 0.922 0.7 106.749 8939617 0.000000
4 Austria 2/29/2020 7.37 0.922 0.7 106.749 8939617 0.000000
... ... ... ... ... ... ... ... ...
2094 Belgium 12/25/2022 5.64 0.931 0.2 375.564 11655923 0.711787
2095 Belgium 12/26/2022 5.64 0.931 0.2 375.564 11655923 0.711787
2096 Belgium 12/27/2022 5.64 0.931 0.2 375.564 11655923 0.711787
2097 Belgium 12/28/2022 5.64 0.931 0.2 375.564 11655923 0.711787
2098 Belgium 12/29/2022 5.64 0.931 0.2 375.564 11655923 0.711787

2099 rows × 8 columns

In [106]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so lags never cross a location boundary)
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [107]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lag history; treat it as zero)
df_updated = df_updated.fillna({
    'prev_day_mortality': 0,
    'prev_week_mortality': 0,
    'prev_month_mortality': 0,
})
In [108]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled columns (large-magnitude columns such
# as population will dominate the variance) and on the full dataset before
# the train/test split (test-set information leaks into the components).
# df_updated.iloc[:,2:] also includes the lagged-mortality columns — confirm
# this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[108]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [109]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Project the data onto the PCA basis and keep only the first n_components scores.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [110]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a mixture of all
# PCA inputs), not the original feature it is named after; downstream
# "feature importances" therefore rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [111]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df); this call also removes the original 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [112]:
# Model inputs are the first five PCA scores (held in principal_df); the
# target is the daily Mortality Rate from the engineered frame.
selected_cols = [
    'hospital_beds_per_thousand',
    'human_development_index',
    'extreme_poverty',
    'population_density',
    'population',
]
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for XGBoost Model
# (30% held out for testing; fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [113]:
# Fit scaling on the training set
# (mean/std learned from the training rows only, so the test set does not
# leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[113]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [114]:
# Apply scaling on the training set
# (uses the train-set mean/std fitted above)
X_train_scaled = scaler.transform(X_train)
In [115]:
# Apply scaling on the test set
# (same train-fitted scaler — test data must never refit it)
X_test_scaled = scaler.transform(X_test)
In [116]:
# Define XGBoost model
# NOTE(review): no random_state is set; with subsample/colsample_bytree < 1
# the fit is not run-to-run reproducible — consider XGBRegressor(random_state=42).
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune (3*3*3*3*2*2 = 324 candidate combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [117]:
# Perform grid search and 10-fold cross-validation (k = 10)
# (scoring defaults to the regressor's R^2; n_jobs=-1 uses all CPU cores)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9979482681643022
In [118]:
# Fit the model using the best hyperparameters
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ is
# already fitted on the full training set, so this fit() retrains on the
# same data — confirm the explicit refit is intentional.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [119]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalizes both arrays into
# probability distributions and returns their KL divergence — it is not a
# standard regression error metric, and zeros in y_pred would make the value
# infinite. Confirm this is the intended "entropy" measure.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.017351533220202466
R2 Score: 0.9985344705576373
RMSE: 0.131725
Entropy Value: 0.0015777407587553624
In [120]:
# Rank the model inputs by XGBoost importance score (descending).
# NOTE(review): the inputs are PCA components that were only *named* after
# the original columns, so these labels do not identify raw features.
importance_table = pd.DataFrame({
    'feature': selected_cols,
    'importance': best_model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)
feature_importances
Out[120]:
feature importance
1 human_development_index 0.700511
2 extreme_poverty 0.139220
0 hospital_beds_per_thousand 0.123668
3 population_density 0.032006
4 population 0.004595
In [121]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[121]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [122]:
# Countries compared in this run of the population-health-index analysis.
country1 = 'Canada'
country2 = 'Denmark'

# Keep only the columns used by the population-health-index model, then
# restrict the rows to the two selected countries.
model_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
in_selected_countries = df_updated['location'].isin([country1, country2])
df_updated = df_updated.loc[in_selected_countries, model_cols]
In [123]:
# Inspect the filtered two-country frame before feature engineering.
df_updated
Out[123]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
5187 Denmark 2/2/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5188 Denmark 2/3/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5189 Denmark 2/4/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5190 Denmark 2/5/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
5191 Denmark 2/6/2020 114.767 6.41 19.3 18.8 80.90 19.677 42.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092509
15717 Canada 12/26/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092338
15718 Canada 12/27/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092196
15719 Canada 12/28/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.092321
15720 Canada 12/29/2022 105.599 7.37 12.0 16.6 82.43 16.984 41.4 1.093162

2134 rows × 10 columns

In [124]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# (shift is applied per country so lags never cross a location boundary)
for lag_col, lag_days in [('prev_day_mortality', 1),
                          ('prev_week_mortality', 7),
                          ('prev_month_mortality', 30)]:
    df_updated[lag_col] = df_updated.groupby(['location'])['Mortality Rate'].shift(lag_days)
In [125]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
# (the first 1/7/30 rows of each country have no lag history; treat it as zero)
df_updated = df_updated.fillna({
    'prev_day_mortality': 0,
    'prev_week_mortality': 0,
    'prev_month_mortality': 0,
})
In [126]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fit on unscaled columns (large-magnitude columns will
# dominate the variance) and on the full dataset before the train/test split
# (test-set information leaks into the components). df_updated.iloc[:,2:]
# also includes the lagged-mortality columns — confirm this is intended.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[126]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [127]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
# Project the data onto the PCA basis and keep only the first n_components scores.
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [128]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): each column here is a principal component (a mixture of all
# PCA inputs), not the original feature it is named after; downstream
# "feature importances" therefore rank components, not raw features.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [129]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the dummy columns are never used below (X is built from
# principal_df); this call also removes the original 'location' column.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [130]:
# Model inputs are the first seven PCA scores (held in principal_df); the
# target is the daily Mortality Rate from the engineered frame.
selected_cols = [
    'cardiovasc_death_rate',
    'diabetes_prevalence',
    'female_smokers',
    'male_smokers',
    'life_expectancy',
    'aged_65_older',
    'median_age',
]
X = principal_df[selected_cols].to_numpy()
y = df_updated['Mortality Rate'].to_numpy()

# Split the dataset into training set and testing set for XGBoost Model
# (30% held out for testing; fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [131]:
# Fit scaling on the training set
scaler = StandardScaler()
scaler.fit(X_train)
Out[131]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [132]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [133]:
# Apply scaling on the test set (transform only — the scaler was fit on the training split,
# which correctly avoids fitting statistics on test data)
X_test_scaled = scaler.transform(X_test)
In [134]:
# Build the (untuned) XGBoost regressor that the grid search below will tune
xgb_model = xgb.XGBRegressor()

# Search space for the hyperparameter grid search
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [135]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default scoring for a regressor is R^2; with 'Mortality Rate' leaked into the PCA
# features, these CV scores are optimistic. n_jobs=-1 parallelizes across all cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9992606850560544
In [136]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits best_estimator_ on the full
# training set, so this extra fit() call is redundant, though harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [137]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE),
# R^2 score, and the KL divergence ("entropy") between test targets and predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes its inputs into probability distributions and returns
# inf whenever qk has a zero where pk is positive — mortality rates contain exact zeros, which
# produced 'Entropy Value: inf' elsewhere in this notebook. Shift both arrays by machine epsilon
# so the statistic stays finite; the change to non-degenerate values is negligible.
eps = np.finfo(float).eps
entropy_val = entropy(np.asarray(y_test) + eps, np.asarray(y_pred) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0025729522227882153
R2 Score: 0.9993859368044151
RMSE: 0.050724
Entropy Value: 0.00030984264476932686
In [138]:
# Rank the retained principal components by how heavily the fitted booster relied on each
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[138]:
feature importance
1 diabetes_prevalence 0.665696
6 median_age 0.173524
0 cardiovasc_death_rate 0.127952
5 aged_65_older 0.019835
2 female_smokers 0.011763
3 male_smokers 0.001135
4 life_expectancy 0.000095
In [139]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[139]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [140]:
# Country pair for this run of the analysis.
country1 = 'Canada'
country2 = 'Denmark'

# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): this whole load -> lag -> PCA -> grid-search pipeline is copy-pasted once per
# country pair; consider factoring it into a function parameterized by (country1, country2, columns).
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [141]:
df_updated
Out[141]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
5187 Denmark 2/2/2020 2.5 0.940 0.2 136.520 5882259 0.000000
5188 Denmark 2/3/2020 2.5 0.940 0.2 136.520 5882259 0.000000
5189 Denmark 2/4/2020 2.5 0.940 0.2 136.520 5882259 0.000000
5190 Denmark 2/5/2020 2.5 0.940 0.2 136.520 5882259 0.000000
5191 Denmark 2/6/2020 2.5 0.940 0.2 136.520 5882259 0.000000
... ... ... ... ... ... ... ... ...
15716 Canada 12/25/2022 2.5 0.929 0.5 4.037 38454328 1.092509
15717 Canada 12/26/2022 2.5 0.929 0.5 4.037 38454328 1.092338
15718 Canada 12/27/2022 2.5 0.929 0.5 4.037 38454328 1.092196
15719 Canada 12/28/2022 2.5 0.929 0.5 4.037 38454328 1.092321
15720 Canada 12/29/2022 2.5 0.929 0.5 4.037 38454328 1.093162

2134 rows × 8 columns

In [142]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift(k) yields "k days ago" only if each location has one row per day with no
# gaps — TODO confirm. groupby('location') correctly prevents lags from crossing country boundaries.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [143]:
# The lag columns are NaN until enough history exists for each location;
# backfill those leading gaps with 0 so the columns are fully numeric.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [144]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag columns — target
# leakage that inflates downstream scores. PCA is also fit unscaled and on the full dataset
# before the train/test split; prefer a scale -> PCA Pipeline fit on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[144]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [145]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): components are variance-ranked mixtures, unrelated to the count of input variables.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [146]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original features — reusing the
# original feature names here makes the later "feature importances" table misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [147]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [148]:
# Columns of principal_df used as model inputs (PCA component scores, not raw features).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled split on a time series leaks future observations into training;
# prefer a chronological split (e.g. sklearn TimeSeriesSplit).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [149]:
# Fit scaling on the training set
# NOTE(review): scaling after PCA only rescales component scores; conventional order is scale -> PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[149]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [150]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [151]:
# Apply scaling on the test set (transform only — scaler statistics come from the training split)
X_test_scaled = scaler.transform(X_test)
In [152]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [153]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default regressor scoring is R^2; with the target leaked into the PCA inputs
# (see the pca.fit cell) these CV scores are optimistic.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9981308764210667
In [154]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV refits best_estimator_ on the training set by default.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [155]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and KL divergence ("entropy") between targets
# and predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes its inputs into probability distributions and returns
# inf when qk is zero where pk is positive; mortality rates contain exact zeros (the notebook
# recorded 'Entropy Value: inf' for one country pair). An epsilon shift keeps the value finite.
eps = np.finfo(float).eps
entropy_val = entropy(np.asarray(y_test) + eps, np.asarray(y_pred) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.006611715252313853
R2 Score: 0.998422041824106
RMSE: 0.081312
Entropy Value: 0.001601041202106293
In [156]:
# Rank the retained principal components by how heavily the fitted booster relied on each
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[156]:
feature importance
1 human_development_index 0.723446
0 hospital_beds_per_thousand 0.146505
2 extreme_poverty 0.065728
3 population_density 0.048554
4 population 0.015767
In [157]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[157]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [158]:
# Country pair for this run of the analysis.
country1 = 'Finland'
country2 = 'France'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): pipeline duplicated per country pair — a parameterized function would remove the copies.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [159]:
df_updated
Out[159]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
7310 Finland 1/29/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7311 Finland 1/30/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7312 Finland 1/31/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7313 Finland 2/1/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
7314 Finland 2/2/2020 153.507 5.76 18.3 22.6 81.91 21.228 42.8 0.000000
... ... ... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411710
9443 France 12/26/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411282
9444 France 12/27/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411730
9445 France 12/28/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411813
9446 France 12/29/2022 86.060 4.77 30.1 35.6 82.66 19.718 42.0 0.411892

2137 rows × 10 columns

In [160]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift(k) means "k days ago" only with one gap-free row per day per location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [161]:
# The lag columns are NaN until enough history exists for each location;
# backfill those leading gaps with 0 so the columns are fully numeric.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [162]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag columns — target
# leakage. PCA is also fit unscaled and on the full dataset before the split; prefer a
# scale -> PCA Pipeline fit on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[162]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [163]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): components are variance-ranked mixtures, unrelated to the count of input variables.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [164]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original features — the original
# feature names are misleading here and in the later importances table.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [165]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [166]:
# Columns of principal_df used as model inputs (PCA component scores, not raw features).
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split on a time series leaks future data; prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [167]:
# Fit scaling on the training set
# NOTE(review): scaling after PCA only rescales component scores; conventional order is scale -> PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[167]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [168]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [169]:
# Apply scaling on the test set (transform only — scaler statistics come from the training split)
X_test_scaled = scaler.transform(X_test)
In [170]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [171]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default regressor scoring is R^2; with the target leaked into the PCA inputs
# (see the pca.fit cell) these CV scores are optimistic.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.995847722245489
In [172]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV refits best_estimator_ on the training set by default.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [173]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and KL divergence ("entropy") between targets
# and predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes its inputs into probability distributions and returns
# inf when qk is zero where pk is positive; mortality rates contain exact zeros (the notebook
# recorded 'Entropy Value: inf' for one country pair). An epsilon shift keeps the value finite.
eps = np.finfo(float).eps
entropy_val = entropy(np.asarray(y_test) + eps, np.asarray(y_pred) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.025345261586814726
R2 Score: 0.9974928018718409
RMSE: 0.159202
Entropy Value: 0.0013971728366706324
In [174]:
# Rank the retained principal components by how heavily the fitted booster relied on each
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[174]:
feature importance
0 cardiovasc_death_rate 0.435901
1 diabetes_prevalence 0.345667
5 aged_65_older 0.147699
2 female_smokers 0.029630
6 median_age 0.019171
3 male_smokers 0.014463
4 life_expectancy 0.007470
In [175]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[175]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [176]:
# Country pair for this run of the analysis.
country1 = 'Finland'
country2 = 'France'

# Extracting important features for XGBoost Model Analysis for the country health index
# NOTE(review): pipeline duplicated per country pair — a parameterized function would remove the copies.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [177]:
df_updated
Out[177]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
7310 Finland 1/29/2020 3.28 0.938 0.04 18.136 5540745 0.000000
7311 Finland 1/30/2020 3.28 0.938 0.04 18.136 5540745 0.000000
7312 Finland 1/31/2020 3.28 0.938 0.04 18.136 5540745 0.000000
7313 Finland 2/1/2020 3.28 0.938 0.04 18.136 5540745 0.000000
7314 Finland 2/2/2020 3.28 0.938 0.04 18.136 5540745 0.000000
... ... ... ... ... ... ... ... ...
9442 France 12/25/2022 5.98 0.901 0.02 122.578 67813000 0.411710
9443 France 12/26/2022 5.98 0.901 0.02 122.578 67813000 0.411282
9444 France 12/27/2022 5.98 0.901 0.02 122.578 67813000 0.411730
9445 France 12/28/2022 5.98 0.901 0.02 122.578 67813000 0.411813
9446 France 12/29/2022 5.98 0.901 0.02 122.578 67813000 0.411892

2137 rows × 8 columns

In [178]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift(k) means "k days ago" only with one gap-free row per day per location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [179]:
# The lag columns are NaN until enough history exists for each location;
# backfill those leading gaps with 0 so the columns are fully numeric.
for lag_col in ('prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality'):
    df_updated[lag_col] = df_updated[lag_col].fillna(0)
In [180]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' (the target) and its lag columns — target
# leakage. PCA is also fit unscaled and on the full dataset before the split; prefer a
# scale -> PCA Pipeline fit on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[180]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [181]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): components are variance-ranked mixtures, unrelated to the count of input variables.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [182]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components, NOT the original features — the original
# feature names are misleading here and in the later importances table.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [183]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the one-hot location columns are never used afterwards (X is built from principal_df).
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [184]:
# Columns of principal_df used as model inputs (PCA component scores, not raw features).
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): shuffled split on a time series leaks future data; prefer a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [185]:
# Fit scaling on the training set
# NOTE(review): scaling after PCA only rescales component scores; conventional order is scale -> PCA.
scaler = StandardScaler()
scaler.fit(X_train)
Out[185]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [186]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [187]:
# Apply scaling on the test set (transform only — scaler statistics come from the training split)
X_test_scaled = scaler.transform(X_test)
In [188]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [189]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): default regressor scoring is R^2; with the target leaked into the PCA inputs
# (see the pca.fit cell) these CV scores are optimistic.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9953573241288052
In [190]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant — GridSearchCV refits best_estimator_ on the training set by default.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [191]:
# Evaluate the XGBoost model: MSE, RMSE, R^2, and KL divergence ("entropy") between targets
# and predictions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) normalizes its inputs into probability distributions and returns
# inf when qk is zero where pk is positive — exactly what happened here (the recorded output
# below this cell is 'Entropy Value: inf', since mortality rates contain exact zeros).
# Shifting both arrays by machine epsilon keeps the statistic finite.
eps = np.finfo(float).eps
entropy_val = entropy(np.asarray(y_test) + eps, np.asarray(y_pred) + eps)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.08019301369568292
R2 Score: 0.9920671651724497
RMSE: 0.283184
Entropy Value: inf
In [192]:
# Rank the retained principal components by how heavily the fitted booster relied on each
importance_scores = best_model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': selected_cols, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
feature_importances
Out[192]:
feature importance
1 human_development_index 0.459088
4 population 0.234622
0 hospital_beds_per_thousand 0.205086
2 extreme_poverty 0.079619
3 population_density 0.021585
In [193]:
# Importing the dataframe of all 26 countries
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[193]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [194]:
# Country pair for this run of the analysis.
country1 = 'Iceland'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the population health index
# NOTE(review): pipeline duplicated per country pair — a parameterized function would remove the copies.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [195]:
df_updated
Out[195]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
20911 Iceland 2/28/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
20912 Iceland 2/29/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
20913 Iceland 3/1/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
20914 Iceland 3/2/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
20915 Iceland 3/3/2020 117.992 5.31 14.3 15.2 82.99 14.431 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23007 Italy 12/26/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23008 Italy 12/27/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23009 Italy 12/28/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109
23010 Italy 12/29/2022 113.151 4.78 19.8 27.8 83.51 23.021 47.9 0.735109

2100 rows × 10 columns

In [196]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift(k) means "k days ago" only with one gap-free row per day per location — TODO confirm.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [197]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [198]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] covers every column after 'location'/'date',
# which includes 'Mortality Rate' (the prediction target) and the three lagged
# mortality columns created above — the components are partly built from the
# target itself (leakage). Confirm the intended column slice.
# NOTE(review): PCA is fit on unscaled data here, so high-variance features such
# as cardiovasc_death_rate dominate; the usual convention is to standardize first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[198]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [199]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [200]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the feature
# it is named after — downstream interpretation should treat them as PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [201]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the
# modeling cells below (X comes from principal_df) — this encoding appears unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [202]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first seven principal components (labelled with feature names);
# y is aligned to X purely by row position, which holds because principal_df
# was built from df_updated's rows in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series places past and
# future observations of the same country in both sets; with autocorrelated
# mortality values this inflates test scores — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [203]:
# Fit scaling on the training set
# Scaler statistics come from the training split only and are applied to both
# splits below — this correctly avoids test-set contamination in scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[203]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [204]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [205]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [206]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate settings; with cv=10 below this means 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [207]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9995037277358563
In [208]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the full training set, so this explicit fit() re-trains on
# identical data — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [209]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalizing both inputs to sum to 1 — it treats the vectors as probability
# distributions, which mortality rates are not, and a zero prediction where
# y_test > 0 would be infinite. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.012240815117304085
R2 Score: 0.9989962218220662
RMSE: 0.110638
Entropy Value: 0.0007257267656131561
In [210]:
# NOTE(review): these importances belong to the principal components, which were
# only labelled with the original feature names in principal_df; they do not
# measure the raw features' importance directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[210]:
feature importance
0 cardiovasc_death_rate 0.468617
6 median_age 0.310603
1 diabetes_prevalence 0.184342
2 female_smokers 0.019198
5 aged_65_older 0.017014
3 male_smokers 0.000171
4 life_expectancy 0.000056
In [211]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[211]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [212]:
country1 = 'Iceland'
country2 = 'Italy'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered subset an independent DataFrame, so the lagged-column
# assignments in later cells modify it directly instead of a view of the full frame
# (avoids SettingWithCopyWarning / potentially lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [213]:
df_updated
Out[213]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
20911 Iceland 2/28/2020 2.91 0.949 0.2 3.404 372903 0.000000
20912 Iceland 2/29/2020 2.91 0.949 0.2 3.404 372903 0.000000
20913 Iceland 3/1/2020 2.91 0.949 0.2 3.404 372903 0.000000
20914 Iceland 3/2/2020 2.91 0.949 0.2 3.404 372903 0.000000
20915 Iceland 3/3/2020 2.91 0.949 0.2 3.404 372903 0.000000
... ... ... ... ... ... ... ... ...
23006 Italy 12/25/2022 3.18 0.892 2.0 205.859 59037472 0.735109
23007 Italy 12/26/2022 3.18 0.892 2.0 205.859 59037472 0.735109
23008 Italy 12/27/2022 3.18 0.892 2.0 205.859 59037472 0.735109
23009 Italy 12/28/2022 3.18 0.892 2.0 205.859 59037472 0.735109
23010 Italy 12/29/2022 3.18 0.892 2.0 205.859 59037472 0.735109

2100 rows × 8 columns

In [214]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [215]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [216]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] covers every column after 'location'/'date',
# which includes 'Mortality Rate' (the prediction target) and the three lagged
# mortality columns created above — the components are partly built from the
# target itself (leakage). Confirm the intended column slice.
# NOTE(review): PCA is fit on unscaled data here, so high-variance features such
# as population dominate; the usual convention is to standardize first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[216]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [217]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [218]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the feature
# it is named after — downstream interpretation should treat them as PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [219]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the
# modeling cells below (X comes from principal_df) — this encoding appears unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [220]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# X holds the first five principal components (labelled with feature names);
# y is aligned to X purely by row position, which holds because principal_df
# was built from df_updated's rows in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series places past and
# future observations of the same country in both sets; with autocorrelated
# mortality values this inflates test scores — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [221]:
# Fit scaling on the training set
# Scaler statistics come from the training split only and are applied to both
# splits below — this correctly avoids test-set contamination in scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[221]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [222]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [223]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [224]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate settings; with cv=10 below this means 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [225]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9992536802375411
In [226]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the full training set, so this explicit fit() re-trains on
# identical data — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [227]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalizing both inputs to sum to 1 — it treats the vectors as probability
# distributions, which mortality rates are not, and a zero prediction where
# y_test > 0 would be infinite. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.03734838401444865
R2 Score: 0.9969373368933739
RMSE: 0.193257
Entropy Value: 0.0026011322728448083
In [228]:
# NOTE(review): these importances belong to the principal components, which were
# only labelled with the original feature names in principal_df; they do not
# measure the raw features' importance directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[228]:
feature importance
0 hospital_beds_per_thousand 0.374572
1 human_development_index 0.351103
3 population_density 0.183497
2 extreme_poverty 0.088126
4 population 0.002702
In [229]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[229]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [230]:
country1 = 'Netherlands'
country2 = 'Sweden'

# Extracting important features for XGBoost Model Analysis for the population health index
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
# .copy() makes the filtered subset an independent DataFrame, so the lagged-column
# assignments in later cells modify it directly instead of a view of the full frame
# (avoids SettingWithCopyWarning / potentially lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [231]:
df_updated
Out[231]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
9447 Netherlands 2/27/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9448 Netherlands 2/28/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9449 Netherlands 2/29/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9450 Netherlands 3/1/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
9451 Netherlands 3/2/2020 109.361 5.29 24.4 27.3 82.28 18.779 43.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24070 Sweden 12/26/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24071 Sweden 12/27/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24072 Sweden 12/28/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.811466
24073 Sweden 12/29/2022 133.982 4.79 18.8 18.9 82.80 19.985 41.0 0.816005

2100 rows × 10 columns

In [232]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [233]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [234]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] covers every column after 'location'/'date',
# which includes 'Mortality Rate' (the prediction target) and the three lagged
# mortality columns created above — the components are partly built from the
# target itself (leakage). Confirm the intended column slice.
# NOTE(review): PCA is fit on unscaled data here, so high-variance features such
# as cardiovasc_death_rate dominate; the usual convention is to standardize first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[234]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [235]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [236]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the feature
# it is named after — downstream interpretation should treat them as PC1..PC7.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [237]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the
# modeling cells below (X comes from principal_df) — this encoding appears unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [238]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
# X holds the first seven principal components (labelled with feature names);
# y is aligned to X purely by row position, which holds because principal_df
# was built from df_updated's rows in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series places past and
# future observations of the same country in both sets; with autocorrelated
# mortality values this inflates test scores — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [239]:
# Fit scaling on the training set
# Scaler statistics come from the training split only and are applied to both
# splits below — this correctly avoids test-set contamination in scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[239]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [240]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [241]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [242]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate settings; with cv=10 below this means 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [243]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9974557008389675
In [244]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the full training set, so this explicit fit() re-trains on
# identical data — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [245]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalizing both inputs to sum to 1 — it treats the vectors as probability
# distributions, which mortality rates are not, and a zero prediction where
# y_test > 0 would be infinite. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.009632293525443848
R2 Score: 0.9990787800266875
RMSE: 0.098144
Entropy Value: 0.0005803548472550031
In [246]:
# NOTE(review): these importances belong to the principal components, which were
# only labelled with the original feature names in principal_df; they do not
# measure the raw features' importance directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[246]:
feature importance
1 diabetes_prevalence 0.788294
2 female_smokers 0.150990
6 median_age 0.023584
0 cardiovasc_death_rate 0.017991
3 male_smokers 0.017966
5 aged_65_older 0.000835
4 life_expectancy 0.000340
In [247]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[247]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [248]:
country1 = 'Netherlands'
country2 = 'Sweden'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# .copy() makes the filtered subset an independent DataFrame, so the lagged-column
# assignments in later cells modify it directly instead of a view of the full frame
# (avoids SettingWithCopyWarning / potentially lost writes).
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [249]:
df_updated
Out[249]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
9447 Netherlands 2/27/2020 3.32 0.944 0.1 508.544 17564020 0.000000
9448 Netherlands 2/28/2020 3.32 0.944 0.1 508.544 17564020 0.000000
9449 Netherlands 2/29/2020 3.32 0.944 0.1 508.544 17564020 0.000000
9450 Netherlands 3/1/2020 3.32 0.944 0.1 508.544 17564020 0.000000
9451 Netherlands 3/2/2020 3.32 0.944 0.1 508.544 17564020 0.000000
... ... ... ... ... ... ... ... ...
24069 Sweden 12/25/2022 2.22 0.945 0.5 24.718 10549349 0.811466
24070 Sweden 12/26/2022 2.22 0.945 0.5 24.718 10549349 0.811466
24071 Sweden 12/27/2022 2.22 0.945 0.5 24.718 10549349 0.811466
24072 Sweden 12/28/2022 2.22 0.945 0.5 24.718 10549349 0.811466
24073 Sweden 12/29/2022 2.22 0.945 0.5 24.718 10549349 0.816005

2100 rows × 8 columns

In [250]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [251]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [252]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:,2:] covers every column after 'location'/'date',
# which includes 'Mortality Rate' (the prediction target) and the three lagged
# mortality columns created above — the components are partly built from the
# target itself (leakage). Confirm the intended column slice.
# NOTE(review): PCA is fit on unscaled data here, so high-variance features such
# as population dominate; the usual convention is to standardize first.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[252]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [253]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5 # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [254]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels reuse the original feature names, but each
# column is a principal component (a linear mix of all inputs), not the feature
# it is named after — downstream interpretation should treat them as PC1..PC5.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [255]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not referenced by the
# modeling cells below (X comes from principal_df) — this encoding appears unused.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [256]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
# X holds the first five principal components (labelled with feature names);
# y is aligned to X purely by row position, which holds because principal_df
# was built from df_updated's rows in order.
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of a daily time series places past and
# future observations of the same country in both sets; with autocorrelated
# mortality values this inflates test scores — consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [257]:
# Fit scaling on the training set
# Scaler statistics come from the training split only and are applied to both
# splits below — this correctly avoids test-set contamination in scaling.
scaler = StandardScaler()
scaler.fit(X_train)
Out[257]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [258]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [259]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [260]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
# 3*3*3*3*2*2 = 324 candidate settings; with cv=10 below this means 3,240 fits.
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [261]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9990940063827581
In [262]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV (refit=True by default) already refits
# best_estimator_ on the full training set, so this explicit fit() re-trains on
# identical data — harmless but redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [263]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) computes KL divergence after
# normalizing both inputs to sum to 1 — it treats the vectors as probability
# distributions, which mortality rates are not, and a zero prediction where
# y_test > 0 would be infinite. Confirm this metric is intended.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.021788176538703085
R2 Score: 0.9979162072504859
RMSE: 0.147608
Entropy Value: 0.0017671236634175858
In [264]:
# NOTE(review): these importances belong to the principal components, which were
# only labelled with the original feature names in principal_df; they do not
# measure the raw features' importance directly.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[264]:
feature importance
1 human_development_index 0.618477
2 extreme_poverty 0.277463
0 hospital_beds_per_thousand 0.066718
3 population_density 0.034665
4 population 0.002677
In [265]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[265]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [266]:
# Countries compared in this run of the analysis
country1 = 'United Kingdom'
country2 = 'Bulgaria'

# Restrict the frame to the population-health-index features for the two
# countries of interest (row filter and column selection in one .loc call).
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [267]:
# Preview the two-country subset created above.
df_updated
Out[267]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
2099 Bulgaria 3/8/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2100 Bulgaria 3/9/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2101 Bulgaria 3/10/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 0.000000
2102 Bulgaria 3/11/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
2103 Bulgaria 3/12/2020 424.688 5.81 30.1 44.4 75.05 20.801 44.7 14.285714
... ... ... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13606 United Kingdom 12/26/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13607 United Kingdom 12/27/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13608 United Kingdom 12/28/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564
13609 United Kingdom 12/29/2022 122.137 4.28 20.0 24.7 81.32 18.517 40.8 0.883564

2090 rows × 10 columns

In [268]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps each country's lags from bleeding into the next;
# assumes rows are date-ordered within each country — TODO confirm.
# NOTE(review): df_updated is a filtered slice of the original frame, so these
# assignments may trigger pandas' SettingWithCopyWarning; adding .copy() after
# the filter would silence it safely.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [269]:
# Replace the NaN values produced by lagging: the first 1/7/30 rows of each
# country have no prior observation, so treat that period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [270]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged-mortality
# columns, so the prediction target leaks into the components that are later
# used as model inputs. PCA is also fit on unscaled data (StandardScaler is
# applied only after the split), letting high-variance columns dominate, and
# it is fit on the full dataset before the train/test split (test leakage).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[270]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [271]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): keeping the first 7 of the components retains nearly all the
# variance of the inputs, so this projection mostly rotates the feature space
# rather than removing collinear information.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [272]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a mixture of all input columns), not the original variable it is
# named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [273]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df), so this step is effectively dead code here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [274]:
# Inputs X are the first 7 principal components (their labels reuse the
# original feature names, though each is a mixture of all inputs);
# target y is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of autocorrelated daily data places
# near-duplicate neighbouring days in both train and test, which inflates the
# reported scores; a chronological split would be fairer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [275]:
# Fit scaling on the training set only, so test-set statistics never leak
# into the transformation. The fitted scaler is echoed as the cell output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[275]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [276]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [277]:
# Apply the train-fitted scaling to the test set (transform only; never refit)
X_test_scaled = scaler.transform(X_test)
In [278]:
# Untuned XGBoost regressor; hyperparameters are selected by the grid search
# in the next cell.
xgb_model = xgb.XGBRegressor()

# Search space: tree depth, shrinkage, ensemble size, minimum split loss,
# and row/column subsampling rates.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [279]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
# NOTE(review): k-fold CV on shuffled time-series rows suffers the same
# neighbouring-day leakage as the shuffled train/test split, so the CV score
# is optimistic; TimeSeriesSplit would be the safer choice.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9362768989166241
In [280]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set
# by default (refit=True), so this explicit .fit() call is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [281]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) renormalizes both vectors
# into probability distributions and returns their KL divergence — it is not a
# regression error metric. It returns inf here because y_test contains zeros
# where y_pred does not (see the recorded output below). Consider dropping it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the target's own units
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.9616768347996019
R2 Score: 0.9502585480128497
RMSE: 0.980651
Entropy Value: inf
In [282]:
# Gain-based importances of the model inputs.
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PCn — labelling them with the original feature
# names in selected_cols misattributes the ranking to those variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[282]:
feature importance
1 diabetes_prevalence 0.468417
5 aged_65_older 0.317984
4 life_expectancy 0.071588
6 median_age 0.051958
2 female_smokers 0.041580
0 cardiovasc_death_rate 0.037815
3 male_smokers 0.010659
In [283]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a relative path or a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Full-frame display (27,272 rows); .head() would keep the output compact.
df_updated
Out[283]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [284]:
# Countries compared in this run of the analysis
country1 = 'United Kingdom'
country2 = 'Bulgaria'

# Restrict the frame to the country-health-index features for the two
# countries of interest (row filter and column selection in one .loc call).
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [285]:
# Preview the two-country subset created above.
df_updated
Out[285]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
2099 Bulgaria 3/8/2020 7.454 0.816 1.5 65.180 6781955 0.000000
2100 Bulgaria 3/9/2020 7.454 0.816 1.5 65.180 6781955 0.000000
2101 Bulgaria 3/10/2020 7.454 0.816 1.5 65.180 6781955 0.000000
2102 Bulgaria 3/11/2020 7.454 0.816 1.5 65.180 6781955 14.285714
2103 Bulgaria 3/12/2020 7.454 0.816 1.5 65.180 6781955 14.285714
... ... ... ... ... ... ... ... ...
13605 United Kingdom 12/25/2022 2.540 0.932 0.2 272.898 67508936 0.883564
13606 United Kingdom 12/26/2022 2.540 0.932 0.2 272.898 67508936 0.883564
13607 United Kingdom 12/27/2022 2.540 0.932 0.2 272.898 67508936 0.883564
13608 United Kingdom 12/28/2022 2.540 0.932 0.2 272.898 67508936 0.883564
13609 United Kingdom 12/29/2022 2.540 0.932 0.2 272.898 67508936 0.883564

2090 rows × 8 columns

In [286]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps each country's lags from bleeding into the next;
# assumes rows are date-ordered within each country — TODO confirm.
# NOTE(review): df_updated is a filtered slice of the original frame, so these
# assignments may trigger pandas' SettingWithCopyWarning; adding .copy() after
# the filter would silence it safely.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [287]:
# Replace the NaN values produced by lagging: the first 1/7/30 rows of each
# country have no prior observation, so treat that period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [288]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged-mortality
# columns, so the prediction target leaks into the components that are later
# used as model inputs. PCA is also fit on unscaled data (StandardScaler is
# applied only after the split), letting high-variance columns such as
# population dominate, and it is fit on the full dataset before the split.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[288]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [289]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): keeping the first 5 of the components retains nearly all the
# variance of the inputs, so this projection mostly rotates the feature space
# rather than removing collinear information.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [290]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a mixture of all input columns), not the original variable it is
# named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [291]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df), so this step is effectively dead code here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [292]:
# Inputs X are the first 5 principal components (their labels reuse the
# original feature names, though each is a mixture of all inputs);
# target y is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of autocorrelated daily data places
# near-duplicate neighbouring days in both train and test, which inflates the
# reported scores; a chronological split would be fairer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [293]:
# Fit scaling on the training set only, so test-set statistics never leak
# into the transformation. The fitted scaler is echoed as the cell output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[293]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [294]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [295]:
# Apply the train-fitted scaling to the test set (transform only; never refit)
X_test_scaled = scaler.transform(X_test)
In [296]:
# Untuned XGBoost regressor; hyperparameters are selected by the grid search
# in the next cell.
xgb_model = xgb.XGBRegressor()

# Search space: tree depth, shrinkage, ensemble size, minimum split loss,
# and row/column subsampling rates.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [297]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
# NOTE(review): k-fold CV on shuffled time-series rows suffers the same
# neighbouring-day leakage as the shuffled train/test split, so the CV score
# is optimistic; TimeSeriesSplit would be the safer choice.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9336897557962663
In [298]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set
# by default (refit=True), so this explicit .fit() call is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [299]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) renormalizes both vectors
# into probability distributions and returns their KL divergence — it is not a
# regression error metric, and it returns inf whenever y_test has a zero where
# y_pred does not. Consider dropping it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the target's own units
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.46495768148618166
R2 Score: 0.9759506838962997
RMSE: 0.681878
Entropy Value: 0.006551530873100116
In [300]:
# Gain-based importances of the model inputs.
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PCn — labelling them with the original feature
# names in selected_cols misattributes the ranking to those variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[300]:
feature importance
1 human_development_index 0.608164
4 population 0.227605
0 hospital_beds_per_thousand 0.068975
2 extreme_poverty 0.063380
3 population_density 0.031876
In [301]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a relative path or a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Full-frame display (27,272 rows); .head() would keep the output compact.
df_updated
Out[301]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [302]:
# Countries compared in this run of the analysis
country1 = 'Cyprus'
country2 = 'Czechia'

# Restrict the frame to the population-health-index features for the two
# countries of interest (row filter and column selection in one .loc call).
health_cols = ['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), health_cols]
In [303]:
# Preview the two-country subset created above.
df_updated
Out[303]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
3126 Cyprus 3/8/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3127 Cyprus 3/9/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3128 Cyprus 3/10/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3129 Cyprus 3/11/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
3130 Cyprus 3/12/2020 141.171 9.24 19.6 52.7 80.98 13.416 37.3 0.000000
... ... ... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919258
5183 Czechia 12/26/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919368
5184 Czechia 12/27/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919431
5185 Czechia 12/28/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919430
5186 Czechia 12/29/2022 227.485 6.82 30.5 38.3 79.38 19.027 43.3 0.919575

2061 rows × 10 columns

In [304]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps each country's lags from bleeding into the next;
# assumes rows are date-ordered within each country — TODO confirm.
# NOTE(review): df_updated is a filtered slice of the original frame, so these
# assignments may trigger pandas' SettingWithCopyWarning; adding .copy() after
# the filter would silence it safely.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [305]:
# Replace the NaN values produced by lagging: the first 1/7/30 rows of each
# country have no prior observation, so treat that period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [306]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged-mortality
# columns, so the prediction target leaks into the components that are later
# used as model inputs. PCA is also fit on unscaled data (StandardScaler is
# applied only after the split), letting high-variance columns dominate, and
# it is fit on the full dataset before the train/test split (test leakage).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[306]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [307]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
# NOTE(review): keeping the first 7 of the components retains nearly all the
# variance of the inputs, so this projection mostly rotates the feature space
# rather than removing collinear information.
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [308]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a mixture of all input columns), not the original variable it is
# named after.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [309]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df), so this step is effectively dead code here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [310]:
# Inputs X are the first 7 principal components (their labels reuse the
# original feature names, though each is a mixture of all inputs);
# target y is the raw mortality rate.
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of autocorrelated daily data places
# near-duplicate neighbouring days in both train and test, which inflates the
# reported scores; a chronological split would be fairer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [311]:
# Fit scaling on the training set only, so test-set statistics never leak
# into the transformation. The fitted scaler is echoed as the cell output.
scaler = StandardScaler()
scaler.fit(X_train)
Out[311]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [312]:
# Apply the train-fitted scaling to the training set
X_train_scaled = scaler.transform(X_train)
In [313]:
# Apply the train-fitted scaling to the test set (transform only; never refit)
X_test_scaled = scaler.transform(X_test)
In [314]:
# Untuned XGBoost regressor; hyperparameters are selected by the grid search
# in the next cell.
xgb_model = xgb.XGBRegressor()

# Search space: tree depth, shrinkage, ensemble size, minimum split loss,
# and row/column subsampling rates.
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [315]:
# Perform grid search and 10-fold cross-validation (k = 10)
# n_jobs=-1 uses all CPU cores; scoring defaults to the regressor's R^2.
# NOTE(review): k-fold CV on shuffled time-series rows suffers the same
# neighbouring-day leakage as the shuffled train/test split, so the CV score
# is optimistic; TimeSeriesSplit would be the safer choice.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9968265232047949
In [316]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV refits best_estimator_ on the full training set
# by default (refit=True), so this explicit .fit() call is redundant.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions on the held-out (scaled) test set
y_pred = best_model.predict(X_test_scaled)
In [317]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
# NOTE(review): scipy.stats.entropy(y_test, y_pred) renormalizes both vectors
# into probability distributions and returns their KL divergence — it is not a
# regression error metric, and it returns inf whenever y_test has a zero where
# y_pred does not. Consider dropping it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the target's own units
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.000978608688705832
R2 Score: 0.9982973886897886
RMSE: 0.031283
Entropy Value: 0.0005033828926016659
In [318]:
# Gain-based importances of the model inputs.
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PCn — labelling them with the original feature
# names in selected_cols misattributes the ranking to those variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[318]:
feature importance
1 diabetes_prevalence 0.658933
0 cardiovasc_death_rate 0.184826
5 aged_65_older 0.094490
6 median_age 0.029701
2 female_smokers 0.024293
3 male_smokers 0.006019
4 life_expectancy 0.001737
In [319]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path — prefer a relative path or a
# configurable DATA_DIR so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
# Full-frame display (27,272 rows); .head() would keep the output compact.
df_updated
Out[319]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [320]:
# Countries compared in this run of the analysis
country1 = 'Cyprus'
country2 = 'Czechia'

# Restrict the frame to the country-health-index features for the two
# countries of interest (row filter and column selection in one .loc call).
country_cols = ['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']
df_updated = df_updated.loc[df_updated['location'].isin([country1, country2]), country_cols]
In [321]:
# Preview the two-country subset created above.
df_updated
Out[321]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
3126 Cyprus 3/8/2020 3.40 0.887 0.15 127.657 896007 0.000000
3127 Cyprus 3/9/2020 3.40 0.887 0.15 127.657 896007 0.000000
3128 Cyprus 3/10/2020 3.40 0.887 0.15 127.657 896007 0.000000
3129 Cyprus 3/11/2020 3.40 0.887 0.15 127.657 896007 0.000000
3130 Cyprus 3/12/2020 3.40 0.887 0.15 127.657 896007 0.000000
... ... ... ... ... ... ... ... ...
5182 Czechia 12/25/2022 6.63 0.900 0.00 137.176 10493990 0.919258
5183 Czechia 12/26/2022 6.63 0.900 0.00 137.176 10493990 0.919368
5184 Czechia 12/27/2022 6.63 0.900 0.00 137.176 10493990 0.919431
5185 Czechia 12/28/2022 6.63 0.900 0.00 137.176 10493990 0.919430
5186 Czechia 12/29/2022 6.63 0.900 0.00 137.176 10493990 0.919575

2061 rows × 8 columns

In [322]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# Grouping by location keeps each country's lags from bleeding into the next;
# assumes rows are date-ordered within each country — TODO confirm.
# NOTE(review): df_updated is a filtered slice of the original frame, so these
# assignments may trigger pandas' SettingWithCopyWarning; adding .copy() after
# the filter would silence it safely.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [323]:
# Replace the NaN values produced by lagging: the first 1/7/30 rows of each
# country have no prior observation, so treat that period as zero mortality.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [324]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): iloc[:, 2:] includes 'Mortality Rate' and the lagged-mortality
# columns, so the prediction target leaks into the components that are later
# used as model inputs. PCA is also fit on unscaled data (StandardScaler is
# applied only after the split), letting high-variance columns dominate, and
# it is fit on the full dataset before the train/test split (test leakage).
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[324]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [325]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
# NOTE(review): keeping the first 5 of the components retains nearly all the
# variance of the inputs, so this projection mostly rotates the feature space
# rather than removing collinear information.
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [326]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these column labels are misleading — each column is a principal
# component (a mixture of all input columns), not the original variable it is
# named after.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [327]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are never used below
# (X is built from principal_df), so this step is effectively dead code here.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [328]:
# Inputs X are the first 5 principal components (their labels reuse the
# original feature names, though each is a mixture of all inputs);
# target y is the raw mortality rate.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of autocorrelated daily data places
# near-duplicate neighbouring days in both train and test, which inflates the
# reported scores; a chronological split would be fairer.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [329]:
# Fit scaling on the training set
# StandardScaler is fitted on training rows only and reused on the test set
# below -- the correct pattern for avoiding scaling leakage at this stage.
scaler = StandardScaler()
scaler.fit(X_train)
Out[329]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [330]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [331]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [332]:
# XGBoost regressor to be tuned by the grid search below
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid explored by GridSearchCV (324 combinations)
params = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 150],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
)
In [333]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 3*3*3*3*2*2 = 324 parameter combinations x 10 folds = 3240 fits, run in
# parallel across all available cores (n_jobs=-1).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9943317261946557
In [334]:
# Fit the model using the best hyperparameters
# NOTE(review): GridSearchCV already refits the best estimator on the whole
# training set by default (refit=True), so this explicit refit is redundant
# but harmless.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [335]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a divergence-based diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) computes the Kullback-Leibler divergence
# D(pk || qk) after normalising both arguments -- it is NOT Shannon entropy,
# and it returns inf whenever qk is zero (and is undefined for negative
# values) where pk is positive.  Mortality rates contain exact zeros and a
# regressor can emit values <= 0, which is why sibling cells in this
# notebook print "Entropy Value: inf".  Clipping both arrays to a tiny
# positive floor keeps the diagnostic finite while leaving already-positive
# values untouched.
_floor = 1e-10
entropy_val = entropy(np.clip(y_test, _floor, None), np.clip(y_pred, _floor, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0027632388218584106
R2 Score: 0.9951924382797649
RMSE: 0.052567
Entropy Value: 0.001055748763209947
In [336]:
# Gain-based importances of the model inputs, sorted descending.
# NOTE(review): because X holds principal components that were merely named
# after original features, these importances describe the components, not
# the named raw variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[336]:
feature importance
1 human_development_index 0.462078
0 hospital_beds_per_thousand 0.393009
2 extreme_poverty 0.063780
4 population 0.049975
3 population_density 0.031158
In [337]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- a configurable data
# directory (e.g. pathlib.Path) would make the notebook portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[337]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [338]:
country1 = 'Estonia'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the population health index
# Keep only the population-health columns plus the target, then restrict to
# the two selected countries.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [339]:
# Inspect the filtered two-country frame
df_updated
Out[339]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
6249 Estonia 1/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6250 Estonia 1/18/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6251 Estonia 2/5/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6252 Estonia 2/6/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
6253 Estonia 2/7/2020 255.569 4.02 24.5 39.3 78.74 19.452 42.7 0.000000
... ... ... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20907 Latvia 12/26/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631631
20908 Latvia 12/27/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20909 Latvia 12/28/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631485
20910 Latvia 12/29/2022 350.060 4.91 25.6 51.0 75.29 19.754 43.9 0.631969

2099 rows × 10 columns

In [340]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() counts rows, not calendar days; the Estonia dates
# above (1/6, 1/18, 2/5) are not consecutive, so these lags are really
# "N observations ago".  TODO confirm daily continuity per country.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [341]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [342]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): PCA is fitted on raw unscaled columns of the full frame,
# before the train/test split -- scale dominance and leakage risk.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[342]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [343]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [344]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components (mixtures of all
# numeric inputs) mislabelled with raw feature names; downstream feature
# importances will be misattributed.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [345]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only `y` is taken from the dummified frame below; the
# one-hot location columns never reach the model input X.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [346]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of time-series rows; a chronological split
# would avoid temporal leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [347]:
# Fit scaling on the training set
# Fitted on training rows only and reused on the test set -- correct.
scaler = StandardScaler()
scaler.fit(X_train)
Out[347]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [348]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [349]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [350]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [351]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds, parallelised with n_jobs=-1.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9983509839471967
In [352]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant refit -- GridSearchCV(refit=True) already did it.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [353]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a divergence-based diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the Kullback-Leibler divergence
# D(pk || qk) after normalisation, NOT Shannon entropy; it returns inf when
# qk is zero (and is undefined for negative values) where pk is positive.
# This cell printed "Entropy Value: inf" for exactly that reason: mortality
# rates contain exact zeros and the regressor can emit values <= 0.
# Clipping both arrays to a tiny positive floor keeps the value finite.
_floor = 1e-10
entropy_val = entropy(np.clip(y_test, _floor, None), np.clip(y_pred, _floor, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0011442225986659504
R2 Score: 0.998102656857225
RMSE: 0.033826
Entropy Value: inf
In [354]:
# Gain-based importances of the model inputs, sorted descending.
# NOTE(review): X holds principal components mislabelled with raw feature
# names, so these importances describe components, not the named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[354]:
feature importance
1 diabetes_prevalence 0.764753
0 cardiovasc_death_rate 0.126479
5 aged_65_older 0.055045
6 median_age 0.033800
2 female_smokers 0.018890
3 male_smokers 0.000750
4 life_expectancy 0.000282
In [355]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[355]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [356]:
country1 = 'Estonia'
country2 = 'Latvia'

# Extracting important features for XGBoost Model Analysis for the country health index
# Keep only socio-economic / infrastructure columns plus the target, then
# restrict to the two selected countries.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [357]:
# Inspect the filtered two-country frame
df_updated
Out[357]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
6249 Estonia 1/6/2020 4.69 0.892 0.5 31.033 1326064 0.000000
6250 Estonia 1/18/2020 4.69 0.892 0.5 31.033 1326064 0.000000
6251 Estonia 2/5/2020 4.69 0.892 0.5 31.033 1326064 0.000000
6252 Estonia 2/6/2020 4.69 0.892 0.5 31.033 1326064 0.000000
6253 Estonia 2/7/2020 4.69 0.892 0.5 31.033 1326064 0.000000
... ... ... ... ... ... ... ... ...
20906 Latvia 12/25/2022 5.57 0.866 0.7 31.212 1850654 0.631631
20907 Latvia 12/26/2022 5.57 0.866 0.7 31.212 1850654 0.631631
20908 Latvia 12/27/2022 5.57 0.866 0.7 31.212 1850654 0.631485
20909 Latvia 12/28/2022 5.57 0.866 0.7 31.212 1850654 0.631485
20910 Latvia 12/29/2022 5.57 0.866 0.7 31.212 1850654 0.631969

2099 rows × 8 columns

In [358]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() counts rows, not calendar days -- gaps in the dates
# make these "N observations ago", not exact week/month lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [359]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [360]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fitted on raw unscaled columns of the full frame before the
# split -- scale dominance (population) and leakage risk.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[360]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [361]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [362]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): principal components mislabelled with raw feature names;
# downstream importances will be misattributed.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [363]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only `y` is read from the dummified frame below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [364]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of time-series rows -- temporal leakage risk.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [365]:
# Fit scaling on the training set
# Fitted on training rows only and reused on the test set -- correct.
scaler = StandardScaler()
scaler.fit(X_train)
Out[365]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [366]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [367]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [368]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [369]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds, parallelised with n_jobs=-1.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.9}
Best CV score: 0.9974510837613941
In [370]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant refit -- GridSearchCV(refit=True) already did it.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [371]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a divergence-based diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the Kullback-Leibler divergence
# D(pk || qk) after normalisation, NOT Shannon entropy; it returns inf when
# qk is zero (and is undefined for negative values) where pk is positive.
# This cell printed "Entropy Value: inf" for exactly that reason: mortality
# rates contain exact zeros and the regressor can emit values <= 0.
# Clipping both arrays to a tiny positive floor keeps the value finite.
_floor = 1e-10
entropy_val = entropy(np.clip(y_test, _floor, None), np.clip(y_pred, _floor, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0018480402171181661
R2 Score: 0.9969355906467765
RMSE: 0.042989
Entropy Value: inf
In [372]:
# Gain-based importances of the model inputs, sorted descending.
# NOTE(review): X holds principal components mislabelled with raw feature
# names, so these importances describe components, not the named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[372]:
feature importance
1 human_development_index 0.772094
3 population_density 0.102371
0 hospital_beds_per_thousand 0.068553
2 extreme_poverty 0.051827
4 population 0.005155
In [373]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[373]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [374]:
country1 = 'Portugal'
country2 = 'Romania'

# Extracting important features for XGBoost Model Analysis for the population health index
# Keep only the population-health columns plus the target, then restrict to
# the two selected countries.
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [375]:
# Inspect the filtered two-country frame
df_updated
Out[375]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
10484 Portugal 3/1/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10485 Portugal 3/2/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10486 Portugal 3/3/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10487 Portugal 3/4/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
10488 Portugal 3/5/2020 127.842 9.85 16.3 30.0 82.05 21.502 46.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.037520
18834 Romania 12/26/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18835 Romania 12/27/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18836 Romania 12/28/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403
18837 Romania 12/29/2022 370.946 9.74 22.9 37.1 76.05 17.850 43.0 2.036403

2072 rows × 10 columns

In [376]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# NOTE(review): shift() counts rows, not calendar days -- gaps in the dates
# make these "N observations ago", not exact week/month lags.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [377]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [378]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): fitted on raw unscaled columns of the full frame before the
# split -- scale dominance and leakage risk.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[378]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [379]:
# Setting the number of principal components to 7 as this equals the number of input variables for the XGBoost Model Analysis for the population health index
n_components = 7  # of input variables for XGBoost Model Analysis
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [380]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): principal components mislabelled with raw feature names;
# downstream importances will be misattributed.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [381]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): only `y` is read from the dummified frame below.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [382]:
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): random split of time-series rows -- temporal leakage risk.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [383]:
# Fit scaling on the training set
# Fitted on training rows only and reused on the test set -- correct.
scaler = StandardScaler()
scaler.fit(X_train)
Out[383]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [384]:
# Apply scaling on the training set
X_train_scaled = scaler.transform(X_train)
In [385]:
# Apply scaling on the test set
X_test_scaled = scaler.transform(X_test)
In [386]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [387]:
# Perform grid search and 10-fold cross-validation (k = 10)
# 324 parameter combinations x 10 folds, parallelised with n_jobs=-1.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9988248313239702
In [388]:
# Fit the model using the best hyperparameters
# NOTE(review): redundant refit -- GridSearchCV(refit=True) already did it.
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [389]:
# Evaluate the XGBoost model: Mean Squared Error (MSE), Root Mean Squared
# Error (RMSE), R^2 score, and a divergence-based diagnostic.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# scipy.stats.entropy(pk, qk) is the Kullback-Leibler divergence
# D(pk || qk) after normalisation, NOT Shannon entropy; it returns inf when
# qk is zero (and is undefined for negative values) where pk is positive --
# sibling cells in this notebook printed "Entropy Value: inf" for exactly
# that reason, since mortality rates contain exact zeros and the regressor
# can emit values <= 0.  Clipping both arrays to a tiny positive floor keeps
# the diagnostic finite while leaving already-positive values untouched.
_floor = 1e-10
entropy_val = entropy(np.clip(y_test, _floor, None), np.clip(y_pred, _floor, None))
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0022348534753311943
R2 Score: 0.9986910224301357
RMSE: 0.047274
Entropy Value: 0.0003035979830766401
In [390]:
# Gain-based importances of the model inputs, sorted descending.
# NOTE(review): X holds principal components mislabelled with raw feature
# names, so these importances describe components, not the named variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[390]:
feature importance
1 diabetes_prevalence 0.525057
0 cardiovasc_death_rate 0.282578
5 aged_65_older 0.132401
6 median_age 0.035947
2 female_smokers 0.019500
3 male_smokers 0.003542
4 life_expectancy 0.000975
In [391]:
# Importing the dataframe of all 26 countries
# NOTE(review): hardcoded absolute Windows path -- not portable.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[391]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [392]:
country1 = 'Portugal'
country2 = 'Romania'

# Extracting important features for XGBoost Model Analysis for the country health index
# Keep only socio-economic / infrastructure columns plus the target, then
# restrict to the two selected countries.
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [393]:
# Inspect the filtered two-country frame
df_updated
Out[393]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
10484 Portugal 3/1/2020 3.390 0.864 0.5 112.371 10270857 0.000000
10485 Portugal 3/2/2020 3.390 0.864 0.5 112.371 10270857 0.000000
10486 Portugal 3/3/2020 3.390 0.864 0.5 112.371 10270857 0.000000
10487 Portugal 3/4/2020 3.390 0.864 0.5 112.371 10270857 0.000000
10488 Portugal 3/5/2020 3.390 0.864 0.5 112.371 10270857 0.000000
... ... ... ... ... ... ... ... ...
18833 Romania 12/25/2022 6.892 0.828 5.7 85.129 19659270 2.037520
18834 Romania 12/26/2022 6.892 0.828 5.7 85.129 19659270 2.036403
18835 Romania 12/27/2022 6.892 0.828 5.7 85.129 19659270 2.036403
18836 Romania 12/28/2022 6.892 0.828 5.7 85.129 19659270 2.036403
18837 Romania 12/29/2022 6.892 0.828 5.7 85.129 19659270 2.036403

2072 rows × 8 columns

In [394]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [395]:
# Replace NaN values in prev_day_mortality, prev_week_mortality, and prev_month_mortality columns with 0
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [396]:
# Performing Principal Component Analysis (PCA) in order to address multi-collinearity.
# NOTE(review): iloc[:, 2:] spans every column after 'location'/'date' — the five predictors
# PLUS 'Mortality Rate' and the three lagged-mortality columns added above. Fitting PCA on
# the target and its lags leaks the target into the downstream "features", which likely
# explains the near-perfect R^2 reported later. PCA is also fit on unscaled data, so the
# large-magnitude 'population' column will dominate the leading component.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[396]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [397]:
# Keep the first 5 principal components (chosen to match the number of original predictors).
# NOTE(review): the PCA was fit on 9 columns (5 predictors + 'Mortality Rate' + 3 lags),
# so these 5 components mix predictor AND target information.
n_components = 5  # number of components kept for the XGBoost model
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [398]:
# Wrap the component scores in a DataFrame.
# NOTE(review): the columns are principal components, NOT the original variables — reusing
# the original feature names here is misleading and propagates into the importance table.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [399]:
# One-hot encode 'location' with get_dummies().
# NOTE(review): the dummy columns are never used — X below is built from principal_df —
# so the only effect of this cell is removing the 'location' column from df_updated.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [400]:
# Model matrix from the (mislabelled) principal components; target from the original frame.
# Row alignment relies on principal_df having been built from the same rows of df_updated.
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [401]:
# Fit the scaler on the training set only (prevents test-set leakage into the scaling).
# NOTE(review): scaling is applied AFTER PCA here; the conventional order is to standardise
# first and then fit PCA, so feature variances are comparable when components are extracted.
scaler = StandardScaler()
scaler.fit(X_train)
Out[401]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [402]:
# Apply the train-fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [403]:
# Apply the same train-fitted scaler to the test set (never refit on test data)
X_test_scaled = scaler.transform(X_test)
In [404]:
# Define XGBoost model
xgb_model = xgb.XGBRegressor()

# Define hyperparameters to tune
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [405]:
# Perform grid search and 10-fold cross-validation (k = 10)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9981547439951199
In [406]:
# Refit the best estimator on the full training set.
# NOTE(review): GridSearchCV already refits the best estimator on the whole training set
# by default (refit=True), so this second fit() is redundant (though harmless).
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [407]:
# Evaluate the model: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2, and "Entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) computes the KL divergence between the two
# series after normalising each to a probability distribution — it is not a regression error
# metric, and it diverges to inf when the inputs are not valid distributions (a later run of
# this same cell prints 'Entropy Value: inf'). Consider dropping it or using MAE instead.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.003836200339738359
R2 Score: 0.9977530964541294
RMSE: 0.061937
Entropy Value: 0.000381309516154465
In [408]:
# Rank the model's feature importances.
# NOTE(review): these "features" are PCA component scores that were relabelled with the
# original column names in principal_df — the importances therefore rank principal
# components, not the named socio-economic variables, and should not be read as
# "human_development_index is the top predictor".
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[408]:
feature importance
1 human_development_index 0.584363
0 hospital_beds_per_thousand 0.321504
2 extreme_poverty 0.054565
3 population_density 0.035218
4 population 0.004351
In [409]:
# Reload the full 26-country dataframe for the next country pair.
# NOTE(review): hardcoded absolute Windows path — prefer a DATA_DIR/pathlib.Path set once in
# a config cell so the notebook runs on other machines.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[409]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [410]:
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting the population-health-index predictors (demographic/health variables) plus
# identifier, date, and target columns, restricted to the two selected countries
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [411]:
df_updated
Out[411]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
11518 Slovakia 3/6/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11519 Slovakia 3/7/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11520 Slovakia 3/8/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11521 Slovakia 3/9/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
11522 Slovakia 3/10/2020 287.959 7.29 23.1 37.7 77.54 15.070 41.2 0.000000
... ... ... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.717058
16755 Serbia 12/26/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716963
16756 Serbia 12/27/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716677
16757 Serbia 12/28/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716395
16758 Serbia 12/29/2022 439.415 10.08 37.7 40.2 76.00 17.366 41.2 0.716205

2067 rows × 10 columns

In [412]:
'''
Create lagged variables (previous day's, week's, and month's mortality rate) with pandas
shift() to convert the time-series into a supervised-learning problem: each row becomes one
observation and each column a feature, the tabular format the XGBoost model requires to
assess which variables are the strongest predictors of COVID-19 mortality per country.
'''
# Create lagged mortality features per country (1-day, 7-day, 30-day).
# NOTE(review): this lag/fill/PCA/grid-search pipeline is copy-pasted for every country pair
# in the notebook; extracting a parameterised function would prevent copy-paste drift.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [413]:
# Replace the leading NaNs introduced by shift() with 0.
# NOTE(review): this fabricates zero-mortality history for each country's first 30 days.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [414]:
# PCA to address multi-collinearity.
# NOTE(review): iloc[:,2:] here includes 'Mortality Rate' and its three lag columns in
# addition to the 7 predictors, so the components leak the target into the features;
# PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[414]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [415]:
# Keep the first 7 principal components (chosen to match the number of original predictors).
# NOTE(review): the PCA was fit on 11 columns (7 predictors + target + 3 lags), so these
# components mix predictor AND target information.
n_components = 7  # number of components kept for the XGBoost model
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [416]:
# Wrap the component scores in a DataFrame.
# NOTE(review): columns are principal components, NOT the original variables; reusing the
# original names here is misleading.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [417]:
# One-hot encode 'location'. NOTE(review): the dummy columns are never used downstream.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [418]:
# Model matrix from the (mislabelled) principal components; target from the original frame
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [419]:
# Fit the scaler on the training set only (prevents test-set leakage into the scaling).
# NOTE(review): conventionally one standardises BEFORE fitting PCA, not after.
scaler = StandardScaler()
scaler.fit(X_train)
Out[419]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [420]:
# Apply the train-fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [421]:
# Apply the same train-fitted scaler to the test set
X_test_scaled = scaler.transform(X_test)
In [422]:
# Define XGBoost model.
# NOTE(review): subsample/colsample_bytree < 1 make fitting stochastic; pass random_state=42
# to XGBRegressor for reproducible grid-search results.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [423]:
# Grid search with 10-fold cross-validation (324 x 10 = 3240 fits — slow); n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the best mean cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9962453859478295
In [424]:
# Refit the best estimator (redundant: GridSearchCV refit=True already did this, but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [425]:
# Evaluate the model: MSE, RMSE, R^2, and "Entropy".
# NOTE(review): this run prints 'Entropy Value: inf' — scipy.stats.entropy(y_test, y_pred)
# is the KL divergence of the two series normalised as probability distributions, which is
# not a regression metric and diverges when the inputs are not valid distributions.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.000787005226820817
R2 Score: 0.9967791971622308
RMSE: 0.028054
Entropy Value: inf
In [426]:
# Rank feature importances.
# NOTE(review): these "features" are relabelled PCA components, so the importances rank
# components, not the original health variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[426]:
feature importance
1 diabetes_prevalence 0.825131
6 median_age 0.119560
5 aged_65_older 0.036567
2 female_smokers 0.013630
3 male_smokers 0.001937
4 life_expectancy 0.001725
0 cardiovasc_death_rate 0.001451
In [427]:
# Reload the full 26-country dataframe for the next run.
# NOTE(review): hardcoded absolute local path; prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[427]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [428]:
country1 = 'Serbia'
country2 = 'Slovakia'

# Extracting the country-health-index predictors (infrastructure/socio-economic variables)
# plus identifier, date, and target columns, restricted to the two selected countries
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [429]:
df_updated
Out[429]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
11518 Slovakia 3/6/2020 5.820 0.860 0.70 113.128 5643455 0.000000
11519 Slovakia 3/7/2020 5.820 0.860 0.70 113.128 5643455 0.000000
11520 Slovakia 3/8/2020 5.820 0.860 0.70 113.128 5643455 0.000000
11521 Slovakia 3/9/2020 5.820 0.860 0.70 113.128 5643455 0.000000
11522 Slovakia 3/10/2020 5.820 0.860 0.70 113.128 5643455 0.000000
... ... ... ... ... ... ... ... ...
16754 Serbia 12/25/2022 5.609 0.806 0.05 80.291 6871547 0.717058
16755 Serbia 12/26/2022 5.609 0.806 0.05 80.291 6871547 0.716963
16756 Serbia 12/27/2022 5.609 0.806 0.05 80.291 6871547 0.716677
16757 Serbia 12/28/2022 5.609 0.806 0.05 80.291 6871547 0.716395
16758 Serbia 12/29/2022 5.609 0.806 0.05 80.291 6871547 0.716205

2067 rows × 8 columns

In [430]:
'''
Create lagged variables (previous day's, week's, and month's mortality rate) with pandas
shift() to convert the time-series into a supervised-learning problem: each row becomes one
observation and each column a feature, the tabular format the XGBoost model requires to
assess which variables are the strongest predictors of COVID-19 mortality per country.
'''
# Create lagged mortality features per country (1-day, 7-day, 30-day).
# NOTE(review): same copy-pasted pipeline as the earlier country pairs — a shared helper
# function would avoid drift between runs.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [431]:
# Replace the leading NaNs introduced by shift() with 0.
# NOTE(review): this fabricates zero-mortality history for each country's first 30 days.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [432]:
# PCA to address multi-collinearity.
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its three lag columns alongside the
# 5 predictors, so the components leak the target; PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[432]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [433]:
# Keep the first 5 principal components (chosen to match the number of original predictors).
# NOTE(review): the PCA was fit on 9 columns (5 predictors + target + 3 lags), so these
# components mix predictor AND target information.
n_components = 5  # number of components kept for the XGBoost model
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [434]:
# Wrap the component scores in a DataFrame.
# NOTE(review): columns are principal components, NOT the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [435]:
# One-hot encode 'location'. NOTE(review): the dummy columns are never used downstream.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [436]:
# Model matrix from the (mislabelled) principal components; target from the original frame
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [437]:
# Fit the scaler on the training set only (prevents test-set leakage into the scaling).
# NOTE(review): conventionally one standardises BEFORE fitting PCA, not after.
scaler = StandardScaler()
scaler.fit(X_train)
Out[437]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [438]:
# Apply the train-fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [439]:
# Apply the same train-fitted scaler to the test set
X_test_scaled = scaler.transform(X_test)
In [440]:
# Define XGBoost model.
# NOTE(review): subsample/colsample_bytree < 1 make fitting stochastic; pass random_state=42
# to XGBRegressor for reproducible grid-search results.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [441]:
# Grid search with 10-fold cross-validation (324 x 10 = 3240 fits — slow); n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the best mean cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9935786810447313
In [442]:
# Refit the best estimator (redundant: GridSearchCV refit=True already did this, but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [443]:
# Evaluate the model: MSE, RMSE, R^2, and "Entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence of the two series
# normalised as distributions, not a regression error metric; consider MAE instead.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0014413661864962608
R2 Score: 0.9941012382821331
RMSE: 0.037965
Entropy Value: 0.0008811405071239758
In [444]:
# Rank feature importances.
# NOTE(review): these "features" are relabelled PCA components, so the importances rank
# components, not the original socio-economic variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[444]:
feature importance
1 human_development_index 0.571363
0 hospital_beds_per_thousand 0.291424
2 extreme_poverty 0.051676
4 population 0.046745
3 population_density 0.038791
In [445]:
# Reload the full 26-country dataframe for the next country pair.
# NOTE(review): hardcoded absolute local path; prefer a configurable DATA_DIR.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[445]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [446]:
country1 = 'Slovenia'
country2 = 'Spain'

# Extracting the population-health-index predictors (demographic/health variables) plus
# identifier, date, and target columns, restricted to the two selected countries
df_updated = df_updated[['location', 'date', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age', 'Mortality Rate']]
df_updated = df_updated[df_updated['location'].isin([country1, country2])]
In [447]:
df_updated
Out[447]:
location date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older median_age Mortality Rate
24074 Spain 2/1/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24075 Spain 2/2/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24076 Spain 2/3/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24077 Spain 2/4/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
24078 Spain 2/5/2020 99.403 7.17 27.4 31.4 83.56 19.436 45.5 0.000000
... ... ... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537665
26195 Slovenia 12/26/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537924
26196 Slovenia 12/27/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537894
26197 Slovenia 12/28/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.537128
26198 Slovenia 12/29/2022 153.493 7.25 20.1 25.0 81.32 19.062 44.5 0.536669

2125 rows × 10 columns

In [448]:
'''
Create lagged variables (previous day's, week's, and month's mortality rate) with pandas
shift() to convert the time-series into a supervised-learning problem: each row becomes one
observation and each column a feature, the tabular format the XGBoost model requires to
assess which variables are the strongest predictors of COVID-19 mortality per country.
'''
# Create lagged mortality features per country (1-day, 7-day, 30-day).
# NOTE(review): same copy-pasted pipeline as the earlier country pairs — a shared helper
# function would avoid drift between runs.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [449]:
# Replace the leading NaNs introduced by shift() with 0.
# NOTE(review): this fabricates zero-mortality history for each country's first 30 days.
df_updated['prev_day_mortality'] = df_updated['prev_day_mortality'].fillna(0)
df_updated['prev_week_mortality'] = df_updated['prev_week_mortality'].fillna(0)
df_updated['prev_month_mortality'] = df_updated['prev_month_mortality'].fillna(0)
In [450]:
# PCA to address multi-collinearity.
# NOTE(review): iloc[:,2:] includes 'Mortality Rate' and its three lag columns alongside the
# 7 predictors, so the components leak the target; PCA is also fit on unscaled data.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[450]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [451]:
# Keep the first 7 principal components (chosen to match the number of original predictors).
# NOTE(review): the PCA was fit on 11 columns (7 predictors + target + 3 lags), so these
# components mix predictor AND target information.
n_components = 7  # number of components kept for the XGBoost model
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [452]:
# Wrap the component scores in a DataFrame.
# NOTE(review): columns are principal components, NOT the original variables.
principal_df = pd.DataFrame(data=principal_components, columns=['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age'])
principal_df['location'] = df_updated['location'].values
In [453]:
# One-hot encode 'location'. NOTE(review): the dummy columns are never used downstream.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [454]:
# Model matrix from the (mislabelled) principal components; target from the original frame
selected_cols = ['cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'life_expectancy', 'aged_65_older', 'median_age']
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# 70/30 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [455]:
# Fit the scaler on the training set only (prevents test-set leakage into the scaling).
# NOTE(review): conventionally one standardises BEFORE fitting PCA, not after.
scaler = StandardScaler()
scaler.fit(X_train)
Out[455]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [456]:
# Apply the train-fitted scaler to the training set
X_train_scaled = scaler.transform(X_train)
In [457]:
# Apply the same train-fitted scaler to the test set
X_test_scaled = scaler.transform(X_test)
In [458]:
# Define XGBoost model.
# NOTE(review): subsample/colsample_bytree < 1 make fitting stochastic; pass random_state=42
# to XGBRegressor for reproducible grid-search results.
xgb_model = xgb.XGBRegressor()

# Hyperparameter grid to tune (3*3*3*3*2*2 = 324 combinations)
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.1, 0.01, 0.001],
          'n_estimators': [50, 100, 150],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.8, 0.9],
          'colsample_bytree': [0.8, 0.9]}
In [459]:
# Grid search with 10-fold cross-validation (324 x 10 = 3240 fits — slow); n_jobs=-1 uses all cores
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters and the best mean cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9989690618121256
In [460]:
# Refit the best estimator (redundant: GridSearchCV refit=True already did this, but harmless)
best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

# Making Predictions on the held-out test set
y_pred = best_model.predict(X_test_scaled)
In [461]:
# Evaluate the model: MSE, RMSE, R^2, and "Entropy".
# NOTE(review): scipy.stats.entropy(y_test, y_pred) is a KL divergence of the two series
# normalised as distributions, not a regression error metric; consider MAE instead.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.0060921214023629666
R2 Score: 0.9990656666044356
RMSE: 0.078052
Entropy Value: 0.000580086181770288
In [462]:
# Rank feature importances.
# NOTE(review): these "features" are relabelled PCA components, so the importances rank
# components, not the original health variables.
feature_importances = best_model.feature_importances_
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[462]:
feature importance
1 diabetes_prevalence 0.754377
0 cardiovasc_death_rate 0.120936
6 median_age 0.072379
2 female_smokers 0.030701
5 aged_65_older 0.018044
3 male_smokers 0.003379
4 life_expectancy 0.000184
In [463]:
# Importing the dataframe of all 26 countries
# NOTE(review): hard-coded absolute Windows path — this breaks on any other
# machine; prefer a configurable DATA_DIR constant with a pathlib.Path.
df_updated = pd.read_csv("C:/Users/marco/Downloads/dataframe-one.csv")
df_updated
Out[463]:
date cardiovasc_death_rate diabetes_prevalence female_smokers male_smokers life_expectancy aged_65_older aged_70_older median_age hospital_beds_per_thousand human_development_index extreme_poverty gdp_per_capita population_density population Mortality Rate location
0 2/25/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
1 2/26/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
2 2/27/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
3 2/28/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
4 2/29/2020 145.183 6.35 28.4 30.9 81.54 19.202 13.748 44.4 7.37 0.922 0.7 45436.686 106.749 8939617 0.000000 Austria
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
27267 12/25/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086136 United States
27268 12/26/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.086032 United States
27269 12/27/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.085212 United States
27270 12/28/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084986 United States
27271 12/29/2022 151.089 10.79 19.1 24.6 78.86 15.413 9.732 38.3 2.77 0.926 1.2 54225.446 35.608 338289856 1.084791 United States

27272 rows × 17 columns

In [464]:
# Countries compared in this section of the analysis
country1 = 'Slovenia'
country2 = 'Spain'

# Extracting important features for XGBoost Model Analysis for the country health index
df_updated = df_updated[['location', 'date', 'hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population', 'Mortality Rate']]
# .copy() materialises an independent DataFrame: later cells assign new
# columns (the lagged mortality features), which on a filtered view would
# trigger pandas' SettingWithCopyWarning / chained-assignment ambiguity.
df_updated = df_updated[df_updated['location'].isin([country1, country2])].copy()
In [465]:
df_updated
Out[465]:
location date hospital_beds_per_thousand human_development_index extreme_poverty population_density population Mortality Rate
24074 Spain 2/1/2020 2.97 0.904 1.0 93.105 47558632 0.000000
24075 Spain 2/2/2020 2.97 0.904 1.0 93.105 47558632 0.000000
24076 Spain 2/3/2020 2.97 0.904 1.0 93.105 47558632 0.000000
24077 Spain 2/4/2020 2.97 0.904 1.0 93.105 47558632 0.000000
24078 Spain 2/5/2020 2.97 0.904 1.0 93.105 47558632 0.000000
... ... ... ... ... ... ... ... ...
26194 Slovenia 12/25/2022 4.50 0.917 0.0 102.619 2119843 0.537665
26195 Slovenia 12/26/2022 4.50 0.917 0.0 102.619 2119843 0.537924
26196 Slovenia 12/27/2022 4.50 0.917 0.0 102.619 2119843 0.537894
26197 Slovenia 12/28/2022 4.50 0.917 0.0 102.619 2119843 0.537128
26198 Slovenia 12/29/2022 4.50 0.917 0.0 102.619 2119843 0.536669

2125 rows × 8 columns

In [466]:
'''
I need to create lagged variables for the previous day's mortality rate, the previous week's mortality rate, and the previous month's mortality rate. 
I can create these lagged variables using the shift() method in pandas in order to convert the Our World in Data COVID-19 timeseries dataset to a supervised learning problem so that the XGBoost Model can be used.
I need to transform the data into a tabular format where each row represents a single observation and each column represents a feature.
That is the reason why it is necessary to convert the timeseries dataset into a supervised learning problem in order for the XGBoost Model to be used directly to assess which variables are the highest predictors of COVID-19 mortality (death rate) per country.
'''
# Create lagged variables for the previous day mortality, previous week mortality, and previous month mortality rates, respectively
# groupby('location') keeps each shift inside a single country, so values do
# not leak across the country boundary; the first 1/7/30 rows of each
# country's series become NaN (filled with 0 in the next cell).
# NOTE(review): these lags are derived from the target itself; combined with
# the shuffled random train/test split below, test-period target information
# can reach the training features — likely inflating the reported R^2.
df_updated['prev_day_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(1)
df_updated['prev_week_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(7)
df_updated['prev_month_mortality'] = df_updated.groupby(['location'])['Mortality Rate'].shift(30)
In [467]:
# Replace the NaNs introduced by shifting (rows with no prior observation in
# their country's series) with 0 in all three lagged mortality-rate columns.
lag_cols = ['prev_day_mortality', 'prev_week_mortality', 'prev_month_mortality']
df_updated[lag_cols] = df_updated[lag_cols].fillna(0)
In [468]:
# Performing Principal Component Analysis (PCA) in order to solve the problem of multi-collinearity
# NOTE(review): df_updated.iloc[:, 2:] includes 'Mortality Rate' (the target)
# and the three lagged mortality columns, so the fitted components encode the
# target itself — target leakage; PCA should be fit on predictor columns only.
# NOTE(review): PCA is scale-sensitive and these columns are unscaled, so
# large-magnitude columns (e.g. population) will dominate the components;
# standardise before fitting, and fit on the training split only.
pca = PCA()
pca.fit(df_updated.iloc[:,2:])
Out[468]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [469]:
# Setting the number of principal components to 5 as this equals the number of input variables for the XGBoost Model Analysis for the country health index
n_components = 5  # of input variables for XGBoost Model Analysis
# Project every row onto the fitted components and keep the first 5
# (equivalent to transforming with PCA(n_components=5) fitted on the same data).
principal_components = pca.transform(df_updated.iloc[:,2:])[:, :n_components]
In [470]:
# Obtaining the resulting dataframe after performing Principal Component Analysis for Dataframe
# NOTE(review): these columns are principal components — linear mixtures of
# all the numeric inputs passed to PCA — not the original variables.
# Labelling them with the original feature names makes the later
# feature-importance table misleading; names like 'PC1'..'PC5' would be
# accurate (the downstream selected_cols cell would need to change in step).
principal_df = pd.DataFrame(data=principal_components, columns=['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population'])
principal_df['location'] = df_updated['location'].values
In [471]:
# Convert categorical variables to numerical variables using one-hot encoding with the get_dummies() method
# NOTE(review): the resulting location_* dummy columns are not part of the
# model input X below (X is built from principal_df), so within this section
# the encoding appears unused — confirm whether a later cell relies on it.
df_updated = pd.get_dummies(df_updated, columns=['location'])
In [472]:
selected_cols = ['hospital_beds_per_thousand', 'human_development_index', 'extreme_poverty', 'population_density', 'population']  # principal-component columns of principal_df (mislabelled with original names — see PCA cells)
X = principal_df[selected_cols].values
y = df_updated['Mortality Rate'].values

# Split the dataset into training set and testing set for XGBoost Model
# NOTE(review): a shuffled random split of daily time-series rows — combined
# with components that already contain the target and its lags — leaks
# information between train and test; a chronological split would be sounder.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [473]:
# Fit scaling on the training set
# (fit on training data only so test-set statistics do not leak into the scaler)
scaler = StandardScaler()
scaler.fit(X_train)
Out[473]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [474]:
# Apply scaling on the training set
# (uses the per-feature mean/std learned by scaler.fit above)
X_train_scaled = scaler.transform(X_train)
In [475]:
# Apply scaling on the test set
# (same training-set statistics — never refit the scaler on test data)
X_test_scaled = scaler.transform(X_test)
In [476]:
# Fresh XGBoost regressor for the country-health-index model; the grid search
# below tunes the same hyperparameter space as in the earlier section.
xgb_model = xgb.XGBRegressor()

# Search space: 3 * 3 * 3 * 3 * 2 * 2 = 324 candidate combinations.
params = {
    'max_depth': [3, 4, 5],              # tree depth
    'learning_rate': [0.1, 0.01, 0.001], # shrinkage per boosting round
    'n_estimators': [50, 100, 150],      # boosting rounds
    'gamma': [0, 0.1, 0.2],              # min loss reduction to split
    'subsample': [0.8, 0.9],             # row sampling per tree
    'colsample_bytree': [0.8, 0.9],      # column sampling per tree
}
In [477]:
# Perform grid search and 10-fold cross-validation (k = 10)
# NOTE(review): scoring is left at the estimator default (R^2 for sklearn
# regressors — confirm against the sklearn version in use), and the integer
# cv=10 uses an unshuffled KFold over the row order of X_train_scaled.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=params, cv=10, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)
Best hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150, 'subsample': 0.8}
Best CV score: 0.9986016344611052
In [478]:
# Retrieve the model refit with the best hyperparameters.
# GridSearchCV(refit=True, the default) has already refit best_estimator_ on
# the full training set, so calling .fit() on it again here was redundant
# and has been removed — the fitted model is identical.
best_model = grid_search.best_estimator_

# Making Predictions
y_pred = best_model.predict(X_test_scaled)
In [479]:
# Evaluate the performance of the XG Boost Model by obtaining the Mean Squared Error (MSE), Root Mean Squared Error (RMSE), R^2 Score, and Entropy
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
score = r2_score(y_test, y_pred)
# NOTE(review): scipy.stats.entropy(pk, qk) normalises both arrays to sum to 1
# and computes the KL divergence between them; mortality rates are not
# probability distributions, so this value is hard to interpret as a
# regression metric, and it returns inf if any y_pred entry is <= 0 where the
# corresponding y_test entry is > 0 — consider dropping it or using MAE.
entropy_val = entropy(y_test, y_pred)
print("MSE: ", mse)
print('R2 Score:', score)
print("RMSE: %f" % (rmse))
print("Entropy Value:", entropy_val)
MSE:  0.01192771638114138
R2 Score: 0.9981706760237249
RMSE: 0.109214
Entropy Value: 0.0014002886806236585
In [480]:
# Extract the learned XGBoost importance scores for the five model inputs.
feature_importances = best_model.feature_importances_
# NOTE(review): the model was trained on principal components, so these
# importances belong to PC1..PC5; labelling them with the original column
# names from selected_cols misattributes importance to individual variables.
# Also note `feature_importances` is reused here first as an ndarray and then
# as a DataFrame — distinct names would be clearer on re-runs.
feature_importances = pd.DataFrame({'feature': selected_cols , 'importance': feature_importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)
feature_importances
Out[480]:
feature importance
1 human_development_index 0.726845
2 extreme_poverty 0.142845
0 hospital_beds_per_thousand 0.091830
3 population_density 0.030780
4 population 0.007701
In [ ]: